In [1]:
# Base folder under which the local model checkpoints are kept
local_model_base_dir = '../../local_models/'
# Name of the model that carries the single-tokens
model_name = 'bert-base-uncased_SequenceClassification_STR_option1'
# The checkpoint path is the base folder immediately followed by the model name
model_checkpoint_dir = f'{local_model_base_dir}{model_name}'
print(f'Model path:{model_checkpoint_dir}')

Model path:../../local_models/bert-base-uncased_SequenceClassification_STR_option1


In [2]:
# Name of this experiment
exp_name = "exp3A_1"

# Switch: when True, idiom tokens are rewritten into the <BERTRAM:...> form
IS_BERTRAM_FORMAT = False

# CSV file holding the idiom / idiom-token pairs
token_PIE_mapping_file = "../../data/token_files/option1_idioms.csv"

In [3]:
# Echo the BERTRAM switch so the cell output records which token format is active
bertram_status_line = f"Should BERTRAM format be used: {IS_BERTRAM_FORMAT}"
print(bertram_status_line)

Should BERTRAM format be used: False


In [4]:
# Folder into which the exported files are written
dump_dir = "./embedding_dump/"

# Add paraphrases of sample PIEs
In addition to the PIE single tokens and their constituent words, synonyms and paraphrases for some of the interesting PIEs are also included in the embedding space for the study.

In [5]:
# Synonyms and short paraphrases.
# NOTE(review): presumably collected for the idioms listed here - confirm:
#   "keep a low profile", "pecking order", "old hat", "close call",
#   "rock bottom", "basket case", "on cloud nine",
#   "get in on the ground floor", "couch potato", "shrinking violet",
#   "sitting duck", "an old flame", "banana republic"
nctti_synonyms = {
    'absolute lowest', 'all time low', 'basement', 'bliss', 'bottom floor', 'cheapest',
    'cliffhanger', 'close one', 'close shave', 'crazy', 'dated', 'defenseless',
    'down low', 'easy prey', 'easy target', 'euphoria', 'ex girlfriend', 'exposed',
    'first floor', 'first story', 'food chain', 'former lover', 'ground level', 'heaven',
    'helpless', 'hierarchy', 'in heaven', 'inactive person', 'inconspicuous', 'land',
    'lazy', 'lazy person', 'lost cause', 'low key', 'lowest point', 'near miss',
    'nervous wreck', 'old fashioned', 'old love', 'old lover', 'old news', 'old-fashioned',
    'out of date', 'past love', 'politically unstable', 'pushover', 'sedentary individual',
    'shy', 'shy person', 'small nation', 'sweet', 'third world country',
    'top of the world', 'uninteresting', 'wallflower', 'wuss',
}

# Paraphrases of selected idioms; the trailing comment names the idiom being paraphrased
interesting_PIEs = {
    "behind bars",                          # in prison
    "meet and talk in a friendly way",      # rub shoulders
    "to awaken",                            # ring a bell
    "recall something",                     # ring a bell
    "take part in dangerous undertaking",   # play with fire
    "to forgive someone",                   # let bygones be bygones
    "busy",                                 # chase your tail
    "die",                                  # kick the bucket
    "passed away",                          # kick the bucket
    "disclose a secret",                    # spill the beans
    "reveal something",                     # spill the beans
    "spill the coffee",
}

# Union of both collections; shown as the cell output
paraphrases_and_synonyms = nctti_synonyms | interesting_PIEs
paraphrases_and_synonyms

{'absolute lowest',
 'all time low',
 'basement',
 'behind bars',
 'bliss',
 'bottom floor',
 'busy',
 'cheapest',
 'cliffhanger',
 'close one',
 'close shave',
 'crazy',
 'dated',
 'defenseless',
 'die',
 'disclose a secret',
 'down low',
 'easy prey',
 'easy target',
 'euphoria',
 'ex girlfriend',
 'exposed',
 'first floor',
 'first story',
 'food chain',
 'former lover',
 'ground level',
 'heaven',
 'helpless',
 'hierarchy',
 'in heaven',
 'inactive person',
 'inconspicuous',
 'land',
 'lazy',
 'lazy person',
 'lost cause',
 'low key',
 'lowest point',
 'meet and talk in a friendly way',
 'near miss',
 'nervous wreck',
 'old fashioned',
 'old love',
 'old lover',
 'old news',
 'old-fashioned',
 'out of date',
 'passed away',
 'past love',
 'politically unstable',
 'pushover',
 'recall something',
 'reveal something',
 'sedentary individual',
 'shy',
 'shy person',
 'small nation',
 'spill the coffee',
 'sweet',
 'take part in dangerous undertaking',
 'third world country',
 'to awak

In [6]:
from transformers import AutoModelForMaskedLM, AutoModelForSequenceClassification
from transformers import AutoTokenizer

import pandas as pd
import numpy as np
import re
import os
import sys
import shutil

In [7]:
# Read the PIE / token-string table from the CSV file
df_pie_token_mapping = pd.read_csv(token_PIE_mapping_file)

if IS_BERTRAM_FORMAT:
    # Wrap every token string as <BERTRAM:...>
    df_pie_token_mapping['idiom_token'] = [
        f"<BERTRAM:{t}>" for t in df_pie_token_mapping['idiom_token']
    ]
    print('Converted to BERTRAM format')

# Preview of the first rows (cell output)
df_pie_token_mapping.head()

Unnamed: 0,idiom,idiom_token
0,off the beaten track,IDoffthebeatentrackID
1,in the running,IDintherunningID
2,give someone the creeps,IDgivesomeonethecreepsID
3,do someone proud,IDdosomeoneproudID
4,take root,IDtakerootID


In [8]:
# Create the output folder; stop if it is already there
if not os.path.isdir(dump_dir):
    os.makedirs(dump_dir)
else:
    raise Exception(f"Output directory {dump_dir} already exists!")

In [9]:
# Load the sequence-classification model from the checkpoint folder
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint_dir)

# Load the tokenizer from the same checkpoint folder
tokenizer_options = dict(use_fast=True, truncation=True)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_dir, **tokenizer_options)

print(f"Loaded both the LM Model & the Tokenizer models")

Loaded both the LM Model & the Tokenizer models


In [10]:
# Get the input (word) embedding matrix.
# get_input_embeddings() is the architecture-independent accessor, so this cell
# keeps working when model_name points at a checkpoint whose backbone attribute
# is not called `.bert`.
embedding_weights = model.get_input_embeddings().weight
# Show the matrix shape as the cell output
embedding_weights.shape

torch.Size([32260, 768])

# Create tokens-id mapping for all the tokens
We need to get the embeddings for all the tokens and the constituent words of the PIEs.

In [11]:
# To store PIE single tokens: vocabulary id -> token string
id_to_single_token_mapping = {}
# To store constituent words of a PIE: word -> list of sub-token ids
word_to_ids = {}

# Fetch the vocabulary once instead of on every row; on a fast tokenizer each
# access to `tokenizer.vocab` is expected to rebuild the whole token -> id dict.
vocab = tokenizer.vocab

# itertuples(index=False, name=None) yields plain (idiom, idiom_token) tuples,
# avoiding the per-row Series that iterrows() constructs.
for pie, token_str in df_pie_token_mapping.itertuples(index=False, name=None):
    # First, record the id of the single token (looked up lower-cased;
    # a token missing from the vocabulary raises KeyError)
    id_to_single_token_mapping[vocab[token_str.lower()]] = token_str

    # Next, encode every constituent word that has not been seen yet.
    # Checking inside the loop also skips a word repeated within the same PIE.
    for pword in pie.split():
        if pword not in word_to_ids:
            word_to_ids[pword] = tokenizer.encode(pword, add_special_tokens=False)

print(f"Got token ids for {len(id_to_single_token_mapping)} PIE single tokens and {len(word_to_ids)} words")

Got token ids for 1738 PIE single tokens and 1735 words


In [12]:
# Max possible length of a line containing tab separated embeddings, a rough estimate.
# Kept so that existing references keep working; get_embedding_string below
# does not depend on it.
MAX_EMB_LINE_LENGTH = (64 + 1)*768

def get_single_token_embeddings(tok_id):
    """Return the row of `embedding_weights` at index `tok_id` as a NumPy array."""
    emb_vec = embedding_weights[tok_id].detach().numpy()
    return emb_vec

def get_embedding_string(emb_vec):
    """
    Convert one embedding vector into a single tab separated string.

    Every value is widened to float64 before it is formatted, so the text
    carries float64 precision.

    The values are joined explicitly rather than through np.array2string:
    array2string abbreviates arrays longer than the NumPy print threshold
    (1000 elements by default) with '...' and wraps lines that exceed
    max_line_width, either of which would corrupt a row of the TSV file.

    Parameters
    ----------
    emb_vec : array-like of floats
        The embedding vector to serialise.

    Returns
    -------
    str
        The values separated by tab characters, without brackets or newlines.
    """
    values = np.asarray(emb_vec).ravel()
    return '\t'.join(str(np.float64(value)) for value in values)

## Word to Embedding string mapping
Create a map of `<tok_word, embedding_str>` for both the PIE single-tokens and their constituent words.

In [13]:
# PIE single-tokens first: look up each token's embedding and serialise it
word_emb_str_mapping = {
    tok_word: get_embedding_string(get_single_token_embeddings(tok_id))
    for tok_id, tok_word in id_to_single_token_mapping.items()
}
    
# Then process all the constituent words(Note: we have array of subtokens per each word!)
def get_average_embedding(token_ids):
    """
    Average the embeddings of several tokens.

    Looks up the embedding of every id in `token_ids`, stacks them into one
    array and returns the mean taken along the first axis.
    """
    stacked = np.array([get_single_token_embeddings(tid) for tid in token_ids])
    return stacked.mean(axis=0)
    
# Constituent words: a word may map to several sub-tokens, so average them
for word in word_to_ids:
    averaged_vec = get_average_embedding(word_to_ids[word])
    word_emb_str_mapping[word] = get_embedding_string(averaged_vec)

print(f"Created word-embedding string mapping for {len(word_emb_str_mapping)} tokens")

Created word-embedding string mapping for 3473 tokens


### Add sample paraphrases and synonyms
Add the paraphrases and synonyms as well. The embeddings are obtained by averaging the embeddings of the subtokens.

In [14]:
for phrase in paraphrases_and_synonyms:
    # Tokenise the phrase without special tokens and average the sub-token embeddings
    sub_token_ids = tokenizer.encode(phrase, add_special_tokens=False)
    # Store the serialised average under the phrase itself
    word_emb_str_mapping[phrase] = get_embedding_string(get_average_embedding(sub_token_ids))

print("Added additional paraphrases & synonyms of sample PIEs.")
print(f"The final word-embedding string mapping contains {len(word_emb_str_mapping)} tokens")

Added additional paraphrases & synonyms of sample PIEs.
The final word-embedding string mapping contains 3536 tokens


## Save the tokens and embedding strings into separate files

In [15]:
word_file_path = os.path.join(dump_dir, exp_name+'_words.tsv')
embedding_file_path = os.path.join(dump_dir, exp_name+'_vectors.tsv')

# Write one word per line to the words file and the matching tab-separated
# embedding to the same line number of the vectors file.
# The files are opened with mode 'w' rather than 'a': re-running this cell then
# rewrites both files instead of appending a second copy of every row.
# The encoding is fixed so the output does not depend on the platform default.
with open(word_file_path, 'w', encoding='utf-8') as word_file, \
        open(embedding_file_path, 'w', encoding='utf-8') as embedding_file:
    for tok_word, emb_str in word_emb_str_mapping.items():
        # Save tok_word
        word_file.write(tok_word + '\n')
        # Save emb_str
        embedding_file.write(emb_str + '\n')

print(f'Saved word and embedding TSV files at {dump_dir}')

Saved word and embedding TSV files at ./embedding_dump/
