In [1]:
from transformers import AutoModelForMaskedLM, AutoModelForSequenceClassification
from transformers import AutoTokenizer

import pandas as pd
import numpy as np
import re
import os
import sys
import shutil
import json

import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

from collections import defaultdict

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/darshan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
local_model_base_dir = '../../local_models/'
# Location of the model with single-tokens
model_name = 'bert-base-uncased_MaskedLM_STR_option1_3B1_pretrained'
model_checkpoint_dir = local_model_base_dir + model_name
print(f'Model path:{model_checkpoint_dir}')

Model path:../../local_models/bert-base-uncased_MaskedLM_STR_option1_3B1_pretrained


In [3]:
wordlist_set_NAME = 'set1'
wordlist_set_json_file = '../../data/emb_visuals/wordlist_set1.json'

# The common set of tokens, paraphrases and words to be considered in the final output
with open(wordlist_set_json_file, 'r') as jfile:
    WORDLIST_DICT = json.load(jfile)
print(WORDLIST_DICT)

{'PIE_list': ['IDmybadID', 'IDgameonID', 'IDchaseyourtailID', 'IDstealtheshowID', 'IDfromscratchID', 'IDlendahandID', 'IDatseaID', 'IDbiteyourlipID', 'IDdaylightrobberyID', 'IDactofgodID', 'IDbreaktheiceID', 'IDrunamileID', 'IDunderthetableID', 'IDholdyourtongueID', 'IDputthekiboshonID', 'IDindutchID', 'IDinthedriversseatID'], 'paraphrases': ['contest', 'ready for something', 'rush around ineffectually', 'very busy', 'center of attention', 'outshine', 'from very beginning', 'assist', 'help', 'confused', 'puzzled', 'repress an emotion', 'unfair trade', 'victim', 'severe natural event', 'relieve tension', 'start a conversation', 'reluctant', 'extremely unwilling', 'secretly or covertly', 'very drunk', 'remain silent', 'put an end to', 'check', 'curb', 'stop', 'in trouble', 'in disfavor', 'be in control', "in the driver's seat", 'make the decisions'], 'words': []}


In [4]:
# Experiment name
exp_name = 'exp3B_1'

IS_BERTRAM_FORMAT = False

token_PIE_mapping_file = '../../data/token_files/option1_idioms.csv'

In [5]:
print(f"Should BERTRAM format be used: {IS_BERTRAM_FORMAT}")

Should BERTRAM format be used: False


In [6]:
# Output directory
dump_dir = './embedding_dump/'

# Add paraphrases of sample PIEs
In addition to the PIE single tokens and their constituent words, synonyms and paraphrases for some of the interesting PIEs are also included in the embedding space for the study.

In [7]:
# paraphrases_and_synonyms = set()

# nctti_synonyms = {
#  'absolute lowest', 'all time low', 'basement', 'bliss', 'bottom floor', 'cheapest', 'cliffhanger', 'close one',
#  'close shave', 'crazy', 'dated', 'defenseless', 'down low', 'easy prey', 'easy target', 'euphoria', 'ex girlfriend', 'exposed',
#  'first floor', 'first story', 'food chain', 'former lover', 'ground level', 'heaven', 'helpless', 'hierarchy', 'in heaven',
#  'inactive person', 'inconspicuous', 'land', 'lazy', 'lazy person', 'lost cause', 'low key', 'lowest point', 'near miss',
#  'nervous wreck', 'old fashioned', 'old love', 'old lover', 'old news', 'old-fashioned', 'out of date', 'past love',
#  'politically unstable', 'pushover', 'sedentary individual', 'shy', 'shy person', 'small nation', 'sweet', 'third world country',
#  'top of the world', 'uninteresting', 'wallflower', 'wuss'
# }

# #"keep a low profile", 
# #"pecking order",
# #"old hat",
# #"close call",
# #"rock bottom",
# #"basket case",
# #"on cloud nine",
# #"get in on the ground floor",
# #"couch potato",
# #"shrinking violet",
# #"sitting duck",
# #"an old flame",
# #"banana republic",


# interesting_PIEs = {
#     "behind bars", #in prison
#     "meet and talk in a friendly way", #rub shoulders
#     "to awaken", "recall something", #ring a bell
#     "take part in dangerous undertaking", #play with fire
#     "to forgive someone", #let bygones be bygones
#     "busy", #chase your tail 
#     "die", "passed away", #kick the bucket
#     "disclose a secret", "reveal something", #spill the beans
#     "spill the coffee"
# }

# paraphrases_and_synonyms.update(nctti_synonyms)
# paraphrases_and_synonyms.update(interesting_PIEs)
# paraphrases_and_synonyms

In [8]:
# Load the PIEs and token strings
df_pie_token_mapping = pd.read_csv(token_PIE_mapping_file)

# Consider only those PIEs that present in the WORDLIST_DICT
df_pie_token_mapping = df_pie_token_mapping[df_pie_token_mapping['idiom_token'].isin(WORDLIST_DICT['PIE_list'])]

if IS_BERTRAM_FORMAT:
    # Convert the tokens to <BERTRAM:...> format
    df_pie_token_mapping['idiom_token'] = df_pie_token_mapping['idiom_token'].map(lambda t: f"<BERTRAM:{t}>")
    WORDLIST_DICT['PIE_list'] = [f"<BERTRAM:{pie}>" for pie in WORDLIST_DICT['PIE_list']]

    print('Converted to BERTRAM format!')

display(df_pie_token_mapping)

Unnamed: 0,idiom,idiom_token
33,act of God,IDactofgodID
111,from scratch,IDfromscratchID
114,chase your tail,IDchaseyourtailID
131,under the table,IDunderthetableID
153,at sea,IDatseaID
179,game on,IDgameonID
362,run a mile,IDrunamileID
500,break the ice,IDbreaktheiceID
674,in Dutch,IDindutchID
713,lend a hand,IDlendahandID


## Consider the words in the MAGPIE corpus

**Consider only those sentences where the current list of PIEs are used**

In [9]:
en_stopwords = stopwords.words('english')
punc_remo_trans = str.maketrans('', '', string.punctuation)

en_stopwords.extend(WORDLIST_DICT['PIE_list'])
en_stopwords = {s.lower() for s in en_stopwords}

# Load the magpie training set
MAGPIE_FULL_FILE = '/home/darshan/work/course/dissertation/idiom_principle_on_magpie_corpus/experiments/exp3A_1/tmp/magpie_full_exp3A_1.csv'
df_magpie = pd.read_csv(MAGPIE_FULL_FILE)
sent_list = df_magpie['sentence_0'].values

MIN_COUNT = 1
MAX_COUNT = 15
MAGPIE_WORD_SET = set()
# Add all the unique, non-stop words to a list (exclude the single tokens as well)
word_counter = defaultdict(int)
for sent in sent_list:
    # Consider only those sentences that have the current list of PIEs
    found=False
    for pie in WORDLIST_DICT['PIE_list']:
        if pie in sent:
            found=True
            break
    if found:
        sent = sent.translate(punc_remo_trans)
        words = [word for word in sent.lower().split() if word not in en_stopwords]
        words = [word for word in words if word.isalpha() and len(word)>2 and len(word)<=12]
        # Count the word occurences
        for word in words:
            word_counter[word] += 1

# Filter out very frequent and very rare words
final_words = [word for word,count in word_counter.items() if count > MIN_COUNT and count < MAX_COUNT]
MAGPIE_WORD_SET = set(final_words)

print(f"Obtained {len(MAGPIE_WORD_SET)} words from MAGPIE corpus")

Obtained 1540 words from MAGPIE corpus


In [10]:
# print(sorted(word_counter.items(), key=lambda p: p[1]))
len(MAGPIE_WORD_SET)

1540

In [11]:
print(MAGPIE_WORD_SET)

{'simple', 'woman', 'spring', 'lived', 'radioactive', 'loss', 'bare', 'knots', 'whose', 'charles', 'diana', 'outfit', 'expenditure', 'meeting', 'luke', 'food', 'staff', 'vulcan', 'lines', 'happy', 'stopping', 'sizes', 'europe', 'enabling', 'betty', 'side', 'point', 'management', 'medal', 'committee', 'cost', 'expect', 'lexicons', 'somebody', 'drug', 'greater', 'french', 'offshore', 'study', 'always', 'king', 'northern', 'question', 'managers', 'door', 'unnecessary', 'storm', 'goods', 'easy', 'supply', 'sea', 'flood', 'tomorrow', 'waste', 'head', 'date', 'opened', 'ambitions', 'worse', 'scrapped', 'assembly', 'yes', 'need', 'stand', 'feels', 'apprentices', 'august', 'initially', 'friday', 'bus', 'house', 'displayed', 'members', 'finish', 'running', 'strokes', 'sons', 'internal', 'ground', 'spread', 'lowering', 'motto', 'act', 'ibm', 'tom', 'stops', 'investigate', 'memory', 'police', 'must', 'courts', 'duke', 'hero', 'warren', 'early', 'list', 'match', 'helping', 'commercial', 'scraps', 

In [12]:
# Output directory check
if os.path.isdir(dump_dir):
    raise Exception(f"Output directory {dump_dir} already exists!")
else:
    os.makedirs(dump_dir)

In [13]:
# Load the BERT model & tokenizers
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint_dir)
# Download the Tokenizer model
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_dir, use_fast=True, truncation=True)
print(f"Loaded both the LM Model & the Tokenizer models")

Some weights of the model checkpoint at ../../local_models/bert-base-uncased_MaskedLM_STR_option1_3B1_pretrained were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized f

Loaded both the LM Model & the Tokenizer models


In [14]:
# Get the embedding matrix
embedding_weights = model.bert.embeddings.word_embeddings.weight
embedding_weights.shape

torch.Size([32260, 768])

# Create tokens-id mapping for all the tokens
We need to get the embeddings for all the tokens and the constituent words of the PIEs.

In [15]:
#To store PIE singel tokens
id_to_single_token_mapping = {}
# To store constituent words of a PIE
word_to_ids = {}

for i,(pie, token_str) in df_pie_token_mapping.iterrows():
    # First, get the id-token mapping for 'token_str'
    token_id = tokenizer.vocab[token_str.lower()]
    # Add to the dict
    id_to_single_token_mapping[token_id] = token_str
    
    # Next, process the individual words in the pie, find their token ids
    pie_words = [pword.strip() for pword in pie.split() if pword not in word_to_ids]
    for pword in pie_words:
        token_ids = tokenizer.encode(pword, add_special_tokens=False)
        word_to_ids[pword] = token_ids
        
print(f"Got token ids for {len(id_to_single_token_mapping)} PIE single tokens and {len(word_to_ids)} words")

Got token ids for 17 PIE single tokens and 38 words


In [16]:
# Max possible length of a line containing tab separated embeddings, a rough estimate
MAX_EMB_LINE_LENGTH = (64 + 1)*768

def get_single_token_embeddings(tok_id):
    emb_vec = embedding_weights[tok_id].detach().numpy()
    return emb_vec

def get_embedding_string(emb_vec):
    """
    # Convert one embedding vector from numpy array into a tab separated string format
    # NOTE: The float64 precision is used here!!
    """
    emb_str = np.array2string(emb_vec, separator='\t', \
                              max_line_width=MAX_EMB_LINE_LENGTH, \
                              formatter={'float_kind':lambda x: str(np.float64(x))}, \
                              suppress_small=False, floatmode='maxprec')
    # Trim [ and ] characters
    emb_str = emb_str[1:-1]
    return emb_str

## Word to Embedding string mapping
Create a map of <tok_word, embedding_str> for both pie single-tokens and the constituent words

In [17]:
word_emb_str_mapping = {}

# First process all the pie single-tokens
for tok_id, tok_word in id_to_single_token_mapping.items():
    emb_vec = get_single_token_embeddings(tok_id)
    emb_str = get_embedding_string(emb_vec)
    word_emb_str_mapping[tok_word] = emb_str
    
# Then process all the constituent words(Note: we have array of subtokens per each word!)
def get_average_embedding(token_ids):
    all_emb_vecs = []
    for tok_id in token_ids:
        emb_vec = get_single_token_embeddings(tok_id)
        all_emb_vecs.append(emb_vec)
    np_embs = np.array(all_emb_vecs)
    avg_emb = np_embs.mean(axis=0)
    return avg_emb
    
for word, tok_ids in word_to_ids.items():
    emb_vec = get_average_embedding(tok_ids)
    emb_str = get_embedding_string(emb_vec)
    word_emb_str_mapping[word] = emb_str
    
print(f"Created word-embedding string mapping for {len(word_emb_str_mapping)} tokens")

Created word-embedding string mapping for 55 tokens


### Add sample paraphrases and synonyms
Add the paraphrases and synonyms as well. The embeddings are obtained by averaging the embeddings of the subtokens.

In [18]:
COMMON_PARAPHRASES = WORDLIST_DICT['paraphrases']
for paraphrase in COMMON_PARAPHRASES:
    # Get the average embeddings for each paraphrase
    tok_ids = tokenizer.encode(paraphrase, add_special_tokens=False)
    emb_vec = get_average_embedding(tok_ids)
    para_emb_str = get_embedding_string(emb_vec)

    # Append to the mapping dictionary
    word_emb_str_mapping[paraphrase] = para_emb_str
    
print("Added additional paraphrases & synonyms of sample PIEs.")
print(f"The final word-embedding string mapping contains {len(word_emb_str_mapping)} tokens")

Added additional paraphrases & synonyms of sample PIEs.
The final word-embedding string mapping contains 86 tokens


### Add the MAGPIE words as well
Add all the words selected from MAGPIE corpus

In [19]:
for mword in MAGPIE_WORD_SET:
    # Get the average embeddings for each word
    tok_ids = tokenizer.encode(mword, add_special_tokens=False)
    emb_vec = get_average_embedding(tok_ids)
    mword_emb_str = get_embedding_string(emb_vec)

    # Append to the mapping dictionary
    word_emb_str_mapping[mword] = mword_emb_str
    
print("Added words from MAGPIE corpus!")
print(f"The final word-embedding string mapping contains {len(word_emb_str_mapping)} tokens")

Added words from MAGPIE corpus!
The final word-embedding string mapping contains 1614 tokens


## Save the tokens and embedding strings into separate files

In [20]:
# Maintain a very specific order of words and be consistent across different experiments!!!
# 1. Combine the PIEs, paraphrases, the words from WORDLIST_DICT and the MAGPIE words 
# 2. Remove duplicates!!
# 3. and sort them!!
final_word_set = set(WORDLIST_DICT['PIE_list'] + WORDLIST_DICT['paraphrases'] + WORDLIST_DICT['words'] + list(MAGPIE_WORD_SET))
SORTED_word_list = list(sorted(final_word_set))

print(f'Final number of words:{len(SORTED_word_list)}')

Final number of words:1586


In [21]:
outfile_prefix = exp_name + '_' + wordlist_set_NAME
word_file_path = os.path.join(dump_dir, outfile_prefix + '_words.tsv')
embedding_file_path = os.path.join(dump_dir, outfile_prefix + '_vectors.tsv')

# For every token in the additional tokens, get the embedding vector
# Append the token to word_file, append the tab-separated emebeddings to embeddings_file
with open(word_file_path, 'a') as word_file:
    with open(embedding_file_path, 'a') as embedding_file:
        # Write to the file in the exact same order!!
        for tok_word in SORTED_word_list:
            emb_str = word_emb_str_mapping[tok_word]
            # Save tok_word
            word_file.write(tok_word + '\n')
            # Save emb_str
            embedding_file.write(emb_str + '\n')

print(f'Saved word and embedding TSV files at {dump_dir}')

Saved word and embedding TSV files at ./embedding_dump/
