In [1]:
from transformers import AutoModelForMaskedLM, AutoModelForSequenceClassification
from transformers import AutoTokenizer

import pandas as pd
import numpy as np
import re
import os
import sys
import shutil
import json

import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

from collections import defaultdict

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/darshan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Folder that holds the locally stored model checkpoints
local_model_base_dir = '../../local_models/'
# Name of the checkpoint that includes the single-token PIEs
model_name = 'bert-base-uncased_option1_with_bertram_bt2'
# Full checkpoint path = base folder + checkpoint name
model_checkpoint_dir = f'{local_model_base_dir}{model_name}'
print('Model path:' + model_checkpoint_dir)

Model path:../../local_models/bert-base-uncased_option1_with_bertram_bt2


In [3]:
wordlist_set_NAME = 'set1'
wordlist_set_json_file = '../../data/emb_visuals/wordlist_set1.json'

# The common set of tokens, paraphrases and words to be considered in the final output.
# The notebook reads the 'PIE_list', 'paraphrases' and 'words' keys of this dict.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform/locale default encoding of open().
with open(wordlist_set_json_file, 'r', encoding='utf-8') as jfile:
    WORDLIST_DICT = json.load(jfile)
print(WORDLIST_DICT)

{'PIE_list': ['IDoffthebeatentrackID', 'IDintherunningID', 'IDtakerootID', 'IDcleanhouseID', 'IDmakehistoryID', 'IDgoallthewayID', 'IDchapterandverseID', 'IDbreakthebankID', 'IDheadforthehillsID', 'IDinafogID', 'IDbringuptherearID', 'IDintheholeID', 'IDtruetoformID', 'IDontheballID', 'IDstakeaclaimID', 'IDupforgrabsID', 'IDupandrunningID', 'IDbehindbarsID', 'IDinmybookID', 'IDinblackandwhiteID', 'IDgoupinsmokeID', 'IDuptheriverID', 'IDmakeyourbloodboilID', 'IDrubshouldersID', 'IDgospareID', 'IDspotonID', 'IDnotcricketID', 'IDintheclubID', 'IDundertheweatherID', 'IDplaywithfireID', 'IDringabellID', 'IDputthebootinID', 'IDsmallbeerID', 'IDturnablindeyeID', 'IDintheblackID', 'IDonthenoseID', 'IDrankandfileID', 'IDpointthefingerID', 'IDoffandonID', 'IDgototownID', 'IDoneofthosethingsID', 'IDkeepthepeaceID', 'IDbydintofID', 'IDmakelightofID', 'IDoutofkilterID', 'IDgettogripswithID', 'IDpresenceofmindID', 'IDoverthehillID', 'IDgrindtoahaltID', 'IDcutthecrapID', 'IDnolaughingmatterID', 'IDgre

In [4]:
# CSV file that maps every PIE to its single-token string
token_PIE_mapping_file = '../../data/token_files/option1_idioms.csv'

# Switch: when True the single tokens are handled in the <BERTRAM:...> form
IS_BERTRAM_FORMAT = True

# Experiment name (becomes part of the output file names)
exp_name = 'bt2'

In [5]:
# Echo the token-format switch so the recorded output shows which format this run used
print(f"Should BERTRAM format be used: {IS_BERTRAM_FORMAT}")

Should BERTRAM format be used: True


In [6]:
# Output directory for the word and embedding TSV files.
# NOTE: the output-directory check later in this notebook raises if this path already exists.
dump_dir = './embedding_dump_full/'

In [7]:
# Load the PIEs and token strings
df_pie_token_mapping = pd.read_csv(token_PIE_mapping_file)

BERTRAM_PREFIX, BERTRAM_SUFFIX = "<BERTRAM:", ">"


def _strip_bertram(tok):
    """Return the raw token of a <BERTRAM:...> wrapped token; other tokens pass through unchanged."""
    if tok.startswith(BERTRAM_PREFIX) and tok.endswith(BERTRAM_SUFFIX):
        return tok[len(BERTRAM_PREFIX):-len(BERTRAM_SUFFIX)]
    return tok


def _wrap_bertram(tok):
    """Return the token in <BERTRAM:...> form; an already wrapped token is not wrapped twice."""
    return f"{BERTRAM_PREFIX}{_strip_bertram(tok)}{BERTRAM_SUFFIX}"


# Raw (unwrapped) PIE tokens. Stripping first keeps this cell re-runnable: an earlier
# run may already have rewritten WORDLIST_DICT['PIE_list'] into the BERTRAM form, and
# filtering the freshly loaded CSV against wrapped tokens would match nothing.
raw_pie_tokens = [_strip_bertram(pie) for pie in WORDLIST_DICT['PIE_list']]

# Consider only those PIEs that are present in the WORDLIST_DICT.
# .copy() yields an independent frame, so the column assignment below does not
# write into a slice of the loaded frame (pandas SettingWithCopyWarning).
df_pie_token_mapping = df_pie_token_mapping[df_pie_token_mapping['idiom_token'].isin(raw_pie_tokens)].copy()

ID_BERTRAM_map = None
if IS_BERTRAM_FORMAT:
    # Convert the tokens to <BERTRAM:...> format
    df_pie_token_mapping['idiom_token'] = df_pie_token_mapping['idiom_token'].map(_wrap_bertram)
    # For future use: wrapped token -> raw token
    ID_BERTRAM_map = {_wrap_bertram(pie): pie for pie in raw_pie_tokens}
    WORDLIST_DICT['PIE_list'] = list(ID_BERTRAM_map)

    print('Converted to BERTRAM format!')

display(df_pie_token_mapping)

Converted to BERTRAM format!


Unnamed: 0,idiom,idiom_token
0,off the beaten track,<BERTRAM:IDoffthebeatentrackID>
1,in the running,<BERTRAM:IDintherunningID>
4,take root,<BERTRAM:IDtakerootID>
5,clean house,<BERTRAM:IDcleanhouseID>
6,make history,<BERTRAM:IDmakehistoryID>
...,...,...
1728,spread yourself too thin,<BERTRAM:IDspreadyourselftoothinID>
1730,beat the rap,<BERTRAM:IDbeattherapID>
1732,practise what you preach,<BERTRAM:IDpractisewhatyoupreachID>
1734,top banana,<BERTRAM:IDtopbananaID>


## Consider the words in the MAGPIE corpus

**Consider only those sentences that contain at least one PIE from the current list**

In [8]:
# Stop-word set: the NLTK English stop words plus the PIE tokens, all lower-cased
en_stopwords = {s.lower() for s in stopwords.words('english') + WORDLIST_DICT['PIE_list']}
# Translation table that deletes every character of string.punctuation
punc_remo_trans = str.maketrans('', '', string.punctuation)

# Load the MAGPIE sentences
MAGPIE_FULL_FILE = './tmp/magpie_full_exp3A_1.csv'
df_magpie = pd.read_csv(MAGPIE_FULL_FILE)
sent_list = df_magpie['sentence_0'].values

MIN_COUNT = 4
MAX_COUNT = 6

# Count the unique, non-stop words (the single tokens are excluded as well)
word_counter = defaultdict(int)
for sent in sent_list:
    # Skip sentences that contain none of the current PIEs
    if not any(pie in sent for pie in WORDLIST_DICT['PIE_list']):
        continue
    for word in sent.translate(punc_remo_trans).lower().split():
        # Keep purely alphabetic, non-stop words of 3 to 12 characters
        if word not in en_stopwords and word.isalpha() and 2 < len(word) <= 12:
            word_counter[word] += 1

# Filter out very frequent and very rare words.
# Both bounds are strict: with MIN_COUNT = 4 and MAX_COUNT = 6 only words that
# occur exactly 5 times are kept.
final_words = [counted_word for counted_word, n_seen in word_counter.items() if MIN_COUNT < n_seen < MAX_COUNT]
MAGPIE_WORD_SET = set(final_words)


print(f"Obtained {len(MAGPIE_WORD_SET)} words from MAGPIE corpus")

Obtained 1459 words from MAGPIE corpus


In [9]:
# Debug aid (kept disabled): list the counted words by ascending frequency
# print(sorted(word_counter.items(), key=lambda p: p[1]))
# Number of words selected from the MAGPIE sentences
len(MAGPIE_WORD_SET)

1459

In [10]:
# Print the selected MAGPIE words for a visual sanity check
print(MAGPIE_WORD_SET)

{'hierarchical', 'resorts', 'lodged', 'rearing', 'blindly', 'scaffolding', 'crowe', 'prosecuting', 'downhill', 'rem', 'peters', 'psyche', 'biological', 'adrian', 'supervisor', 'basil', 'arson', 'noting', 'flourished', 'beams', 'whirlwind', 'rains', 'sweetest', 'mutations', 'periodic', 'outraged', 'transferring', 'sculptures', 'tandem', 'lavatory', 'shortest', 'kitty', 'referrals', 'modify', 'dangerously', 'sailor', 'drawers', 'whisper', 'disrepute', 'aching', 'zuwaya', 'fourteenth', 'cnaa', 'shamrock', 'replaces', 'covent', 'monument', 'procession', 'pumped', 'beware', 'forties', 'graf', 'stevie', 'shark', 'deference', 'dell', 'teens', 'sideboard', 'kleenex', 'turbo', 'clung', 'soaking', 'ave', 'nazi', 'attachment', 'crony', 'slamming', 'smoked', 'lobster', 'trifle', 'altar', 'ordeal', 'deferential', 'guerrillas', 'fracture', 'pepper', 'unrest', 'roadside', 'neglected', 'liberties', 'resilience', 'kissing', 'evaluating', 'watchers', 'instinctive', 'dinners', 'belinda', 'melbourne', 're

In [11]:
# Create the output directory; refuse to continue when it is already there
if not os.path.isdir(dump_dir):
    os.makedirs(dump_dir)
else:
    raise Exception(f"Output directory {dump_dir} already exists!")

In [12]:
# Load the BERT model from the local checkpoint directory
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint_dir)
# Load the tokenizer stored in the same checkpoint directory
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint_dir,
    use_fast=True,
    truncation=True,
)
print("Loaded both the LM Model & the Tokenizer models")

Some weights of the model checkpoint at ../../local_models/bert-base-uncased_option1_with_bertram_bt2 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ../../local_mod

Loaded both the LM Model & the Tokenizer models


In [13]:
# Get the embedding matrix: the word-embedding weight of the model's BERT encoder
embedding_weights = model.bert.embeddings.word_embeddings.weight
# Display its shape as the cell output
embedding_weights.shape

torch.Size([31783, 768])

# Create tokens-id mapping for all the tokens
We need to get the embeddings for all the tokens and the constituent words of the PIEs.

In [14]:
# To store the vocabulary id of every PIE single token
single_token_to_id_mapping = {}
# To store the sub-token ids of every constituent word of a PIE
word_to_ids = {}

# Read the vocabulary once instead of once per row: `tokenizer.vocab` is
# evaluated on every access and does not change inside the loop.
tokenizer_vocab = tokenizer.vocab

for i,(pie, token_str) in df_pie_token_mapping.iterrows():
    # First, get the id-token mapping for 'token_str'.
    # BERTRAM tokens are looked up verbatim, plain tokens in lower case.
    vocab_key = token_str if IS_BERTRAM_FORMAT else token_str.lower()
    token_id = tokenizer_vocab.get(vocab_key)
    if token_id is None:
        # Same fallback for both formats: report the token and map it to [UNK]
        print(f'ERROR! {token_str} not found!')
        token_id = tokenizer.unk_token_id
    # Add to the dict
    single_token_to_id_mapping[token_str] = token_id

    # Next, process the individual words in the pie, find their token ids.
    # str.split() already drops surrounding whitespace, so no extra strip is needed.
    for pword in pie.split():
        if pword not in word_to_ids:
            word_to_ids[pword] = tokenizer.encode(pword, add_special_tokens=False)

print(f"Got token ids for {len(single_token_to_id_mapping)} PIE single tokens and {len(word_to_ids)} words")

Got token ids for 1261 PIE single tokens and 1292 words


In [15]:
# Inspect the tokenizer vocabulary (token string -> id); the <BERTRAM:...> single tokens
# are expected to show up next to the regular entries
tokenizer.vocab

{'pits': 14496,
 '省': 1920,
 'rumble': 15658,
 'gunmen': 28932,
 'kathleen': 14559,
 '##լ': 29773,
 'lords': 8140,
 'wwf': 16779,
 'closes': 14572,
 'expo': 16258,
 '##प': 29864,
 'tariffs': 26269,
 'mark': 2928,
 'fife': 20537,
 'و': 1298,
 'fred': 5965,
 'spaces': 7258,
 '##inging': 23180,
 'lace': 12922,
 'pmid': 20117,
 'antenna': 13438,
 'tighter': 12347,
 '[unused175]': 180,
 'ᵉ': 1499,
 '[unused302]': 307,
 'taxation': 14952,
 '308': 24232,
 'interceptor': 24727,
 '[unused917]': 922,
 '<BERTRAM:IDneveryoumindID>': 31207,
 'motown': 22654,
 'succeeding': 13034,
 'bean': 14068,
 '##rish': 18774,
 '##otted': 26174,
 'charges': 5571,
 'cried': 6639,
 'capitol': 9424,
 'uranium': 14247,
 'mansfield': 15352,
 'intuition': 26406,
 'viaduct': 20596,
 'prophecy': 14951,
 '1791': 14362,
 'brendan': 15039,
 'hitter': 18694,
 'cricket': 4533,
 'unit': 3131,
 '##bia': 11607,
 'edo': 18314,
 'bombardier': 29143,
 'disdain': 25134,
 '<BERTRAM:IDonthespurofthemomentID>': 31300,
 'had': 2018,
 '

In [16]:
# Max possible length of a line containing tab separated embeddings, a rough estimate.
# NOTE(review): reads as 64 characters plus 1 separator for each of 768 embedding
# dimensions — confirm 768 against the loaded model's embedding width.
MAX_EMB_LINE_LENGTH = (64 + 1)*768

def get_single_token_embeddings(tok_id):
    """
    Return the embedding row of one vocabulary id as a numpy array.

    tok_id: index into the module-level `embedding_weights` matrix.
    Returns the selected row, detached from the autograd graph.

    The tensor is moved to the CPU before the conversion because
    Tensor.numpy() only works for CPU tensors; for a weight that already
    lives on the CPU this is a no-op and the returned array shares memory
    with the weight, so treat it as read-only.
    """
    return embedding_weights[tok_id].detach().cpu().numpy()

def get_embedding_string(emb_vec):
    """
    Convert one embedding vector from a numpy array into a tab separated string.

    NOTE: The float64 precision is used here!! Every component is widened to
    float64 before it is formatted with str().

    The fields are joined directly rather than rendered with np.array2string:
    array2string abbreviates arrays longer than its print threshold with
    '...' and breaks lines that exceed max_line_width, and either of those
    would corrupt a row of the vectors file.

    emb_vec: 1-D array-like of floats (multi-dimensional input is flattened).
    Returns the components separated by single tab characters, without
    brackets and without a trailing newline; an empty input gives ''.
    """
    values = np.asarray(emb_vec, dtype=np.float64).ravel()
    return '\t'.join(str(value) for value in values)

## Word to Embedding string mapping
Create a map of `<tok_word, embedding_str>` for both the PIE single-tokens and their constituent words.

In [17]:
# Start the mapping with the PIE single-tokens: each one is rendered from the
# embedding row of its own vocabulary id
word_emb_str_mapping = {
    pie_token: get_embedding_string(get_single_token_embeddings(pie_token_id))
    for pie_token, pie_token_id in single_token_to_id_mapping.items()
}
    
# Then process all the constituent words(Note: we have array of subtokens per each word!)
def get_average_embedding(token_ids):
    """
    Average the embedding vectors of the given sub-token ids.

    token_ids: non-empty iterable of vocabulary ids (e.g. the ids that
        tokenizer.encode returns for one word or phrase).
    Returns the element-wise mean of the per-id embedding vectors.
    Raises ValueError if token_ids is empty: the mean of nothing is NaN,
        which would otherwise end up as a garbage row in the embedding dump.
    """
    token_ids = list(token_ids)
    if not token_ids:
        raise ValueError("Cannot average embeddings: no token ids were given")
    sub_token_vecs = np.array([get_single_token_embeddings(tok_id) for tok_id in token_ids])
    return sub_token_vecs.mean(axis=0)
    
# One entry per constituent word, rendered from the mean of its sub-token embeddings
word_emb_str_mapping.update(
    (pie_word, get_embedding_string(get_average_embedding(sub_ids)))
    for pie_word, sub_ids in word_to_ids.items()
)

print(f"Created word-embedding string mapping for {len(word_emb_str_mapping)} tokens")

Created word-embedding string mapping for 2553 tokens


### Add sample paraphrases and synonyms
Add the paraphrases and synonyms as well. The embeddings are obtained by averaging the embeddings of the subtokens.

In [18]:
COMMON_PARAPHRASES = WORDLIST_DICT['paraphrases']
# Every paraphrase is stored under its own text, rendered from the mean of
# the embeddings of its sub-tokens
word_emb_str_mapping.update(
    (
        paraphrase,
        get_embedding_string(
            get_average_embedding(tokenizer.encode(paraphrase, add_special_tokens=False))
        ),
    )
    for paraphrase in COMMON_PARAPHRASES
)

print("Added additional paraphrases & synonyms of sample PIEs.")
print(f"The final word-embedding string mapping contains {len(word_emb_str_mapping)} tokens")

Added additional paraphrases & synonyms of sample PIEs.
The final word-embedding string mapping contains 2553 tokens


### Add the MAGPIE words as well
Add all the words selected from MAGPIE corpus

In [19]:
for mword in MAGPIE_WORD_SET:
    # Tokenize the word, average its sub-token embeddings, store the rendered vector
    sub_ids = tokenizer.encode(mword, add_special_tokens=False)
    word_emb_str_mapping[mword] = get_embedding_string(get_average_embedding(sub_ids))

print("Added words from MAGPIE corpus!")
print(f"The final word-embedding string mapping contains {len(word_emb_str_mapping)} tokens")

Added words from MAGPIE corpus!
The final word-embedding string mapping contains 3980 tokens


## Save the tokens and embedding strings into separate files

In [20]:
# Keep one fixed word order that is consistent across different experiments:
# pool the PIEs, the paraphrases, the words from WORDLIST_DICT and the MAGPIE words,
# let the set drop the duplicates, then sort.
final_word_set = {
    *WORDLIST_DICT['PIE_list'],
    *WORDLIST_DICT['paraphrases'],
    *WORDLIST_DICT['words'],
    *MAGPIE_WORD_SET,
}
SORTED_word_list = sorted(final_word_set)

print(f'Final number of words:{len(SORTED_word_list)}')

Final number of words:2720


In [21]:
outfile_prefix = exp_name + '_' + wordlist_set_NAME
word_file_path = os.path.join(dump_dir, outfile_prefix + '_words.tsv')
embedding_file_path = os.path.join(dump_dir, outfile_prefix + '_vectors.tsv')

# Fail before any file is touched when a word has no embedding string: a KeyError
# in the middle of the write loop would leave the word file one line longer than
# the vector file and break the line-by-line correspondence.
missing_words = [w for w in SORTED_word_list if w not in word_emb_str_mapping]
if missing_words:
    raise KeyError(f"No embedding string for {len(missing_words)} word(s), e.g. {missing_words[:5]}")

# Line i of the word file and line i of the vector file describe the same token.
# Mode 'w' (not 'a'): re-running this cell rewrites both files instead of
# appending a second copy of every row. The encoding is fixed so the output does
# not depend on the platform default.
with open(word_file_path, 'w', encoding='utf-8') as word_file, \
     open(embedding_file_path, 'w', encoding='utf-8') as embedding_file:
    # Write to the files in the exact same order!!
    for tok_word in SORTED_word_list:
        # BERTRAM tokens are written under the plain token they were built from
        if tok_word.startswith("<BERTRAM:"):
            output_tok_word = ID_BERTRAM_map[tok_word]
        else:
            output_tok_word = tok_word

        # Save tok_word and its embedding string
        word_file.write(output_tok_word + '\n')
        embedding_file.write(word_emb_str_mapping[tok_word] + '\n')

print(f'Saved word and embedding TSV files at {dump_dir}')

Saved word and embedding TSV files at ./embedding_dump_full/
