In [1]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


# Inserting slang into BERT

In [2]:
TORCH_SEED = 69
torch.manual_seed(TORCH_SEED)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased", output_hidden_states=False)

In [3]:
def load_glove_embeddings(file_path):
    embeddings_index = {}
    
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = vector
    
    print(f"Loaded {len(embeddings_index)} word vectors from GloVe.")
    return embeddings_index

file_path = 'Embeddings/glove.twitter.27B/glove.twitter.27B.200d.txt'
glove_embeddings = load_glove_embeddings(file_path)
GLOVE_EMBEDDING_SIZE = 200

Loaded 1193514 word vectors from GloVe.


In [4]:
slang_df = pd.read_csv('Vocabulary/slang_words_final.csv', header=None, names=['slang'])
slang_list = slang_df['slang'].tolist()
slang_list

['lol',
 'im',
 'oh',
 'u',
 'bio',
 'wow',
 'omg',
 'ur',
 'ya',
 'pm',
 'pic',
 'dm',
 'btw',
 'gt',
 'bbq',
 'app',
 'lmao',
 'xd',
 'k',
 'ppl',
 'wtf',
 'rip',
 'bday',
 'idk',
 'thx',
 'sis',
 'vip',
 'bb',
 'ooh',
 'plz',
 'sir',
 'yr',
 'fml',
 'fam',
 'bf',
 'peeps',
 'jk',
 'wat',
 'pls',
 'atm',
 'asap',
 'smh',
 'aka',
 'bff',
 'fab',
 'ily',
 'gf',
 'mid',
 'meh',
 'abt',
 'lmfao',
 'ty',
 'ff',
 'fyi',
 'tht',
 'sux',
 'goat',
 'ftw',
 'lit',
 'probs',
 'cap',
 'piss',
 'prolly',
 'msg',
 'tbh',
 'thnx',
 'cam',
 'nc',
 'omfg',
 'gd',
 'rofl',
 'gal',
 'wth',
 'dis',
 'tgif',
 'nw',
 'sry',
 'admin',
 'imo',
 'mc',
 'gud',
 'op',
 'hv',
 'bak',
 'ratio',
 'drip',
 'wah',
 'gg',
 'ig',
 'veg',
 'wrk',
 'eva',
 'bot',
 'ttyl',
 'fu',
 'mb',
 'nt',
 'fr',
 'inc',
 'rly',
 'np',
 'ss',
 'ot',
 'brb',
 'tu',
 'smash',
 'srsly',
 'mod',
 'pos',
 'chad',
 'af',
 'kk',
 'gna',
 'sup',
 'gnite',
 'gnight',
 'wut',
 'sheesh',
 'yt',
 'cya',
 'swag',
 'kia',
 'ffs',
 'xoxoxo',
 'ly'

In [5]:
embedding_layer = model.embeddings.word_embeddings

slang_projection_layer = torch.nn.Linear(GLOVE_EMBEDDING_SIZE, model.config.hidden_size)

new_slang_tokens = [token for token in slang_list if token not in tokenizer.get_vocab()]
tokenizer.add_tokens(new_slang_tokens)

model.resize_token_embeddings(len(tokenizer))

for slang in new_slang_tokens:
    token_index = tokenizer.convert_tokens_to_ids(slang)
    
    if slang in glove_embeddings:
        glove_vector = torch.tensor(glove_embeddings[slang], dtype=torch.float32)
        projected_vector = slang_projection_layer(glove_vector.unsqueeze(0)).squeeze(0)
        embedding_layer.weight.data[token_index] = projected_vector
        print(f"Initialized '{slang}' with projected GloVe weights.")
    else:
        embedding_layer.weight.data[token_index] = torch.randn(model.config.hidden_size)
        print(f"Initialized '{slang}' with random weights.")


Initialized 'lol' with projected GloVe weights.
Initialized 'omg' with projected GloVe weights.
Initialized 'dm' with projected GloVe weights.
Initialized 'btw' with projected GloVe weights.
Initialized 'bbq' with projected GloVe weights.
Initialized 'lmao' with projected GloVe weights.
Initialized 'xd' with projected GloVe weights.
Initialized 'ppl' with projected GloVe weights.
Initialized 'wtf' with projected GloVe weights.
Initialized 'bday' with projected GloVe weights.
Initialized 'idk' with projected GloVe weights.
Initialized 'thx' with projected GloVe weights.
Initialized 'ooh' with projected GloVe weights.
Initialized 'plz' with projected GloVe weights.
Initialized 'yr' with projected GloVe weights.
Initialized 'fml' with projected GloVe weights.
Initialized 'fam' with projected GloVe weights.
Initialized 'peeps' with projected GloVe weights.
Initialized 'jk' with projected GloVe weights.
Initialized 'pls' with projected GloVe weights.
Initialized 'asap' with projected GloVe 

In [6]:
for slang in new_slang_tokens:
    token_index = tokenizer.convert_tokens_to_ids(slang)
    if token_index != tokenizer.unk_token_id:
        print(f"'{slang}' successfully added to the tokenizer with index {token_index}.")
    else:
        print(f"'{slang}' was not added to the tokenizer.")


'lol' successfully added to the tokenizer with index 30522.
'omg' successfully added to the tokenizer with index 30523.
'dm' successfully added to the tokenizer with index 30524.
'btw' successfully added to the tokenizer with index 30525.
'bbq' successfully added to the tokenizer with index 30526.
'lmao' successfully added to the tokenizer with index 30527.
'xd' successfully added to the tokenizer with index 30528.
'ppl' successfully added to the tokenizer with index 30529.
'wtf' successfully added to the tokenizer with index 30530.
'bday' successfully added to the tokenizer with index 30531.
'idk' successfully added to the tokenizer with index 30532.
'thx' successfully added to the tokenizer with index 30533.
'ooh' successfully added to the tokenizer with index 30534.
'plz' successfully added to the tokenizer with index 30535.
'yr' successfully added to the tokenizer with index 30536.
'fml' successfully added to the tokenizer with index 30537.
'fam' successfully added to the tokenizer

In [7]:
for slang in new_slang_tokens:
    token_index = tokenizer.convert_tokens_to_ids(slang)
    if token_index != tokenizer.unk_token_id:
        embedding_vector = embedding_layer.weight.data[token_index]
        print(f"Embedding for '{slang}': {embedding_vector[:5]}")

Embedding for 'lol': tensor([-0.3706, -0.1505, -0.7883,  0.1622,  0.0539])
Embedding for 'omg': tensor([-0.1702, -0.0533, -0.7506,  0.0376,  0.2117])
Embedding for 'dm': tensor([-0.0391, -0.2768, -0.3481,  0.1401,  0.3842])
Embedding for 'btw': tensor([-0.1023, -0.0513, -0.6508, -0.0574, -0.0829])
Embedding for 'bbq': tensor([-0.4082,  0.2432, -0.3136,  0.0752,  0.1574])
Embedding for 'lmao': tensor([-0.4083, -0.1617, -0.7171,  0.1970,  0.0265])
Embedding for 'xd': tensor([-0.1308, -0.0710, -0.8332,  0.1608,  0.3912])
Embedding for 'ppl': tensor([ 0.0139, -0.2509, -0.3698, -0.1251, -0.2624])
Embedding for 'wtf': tensor([-0.0329, -0.1454, -0.5358,  0.3138,  0.2205])
Embedding for 'bday': tensor([-0.2101,  0.3776, -0.9609, -0.2693, -0.0524])
Embedding for 'idk': tensor([-0.0946, -0.1980, -0.8532,  0.0479,  0.0510])
Embedding for 'thx': tensor([-0.1844, -0.4793, -0.5281, -0.1230, -0.1984])
Embedding for 'ooh': tensor([-0.0421, -0.3134, -0.5394,  0.1425,  0.2673])
Embedding for 'plz': tens

# Inserting emoji in BERT

In [13]:
from gensim.models import keyedvectors

e2v = keyedvectors.load_word2vec_format('Embeddings/emoji2vec.bin', binary=True)

In [14]:
print(f"Dimension of e2v embedding:", e2v[0].shape[0])

Dimension of e2v embedding: 300


In [15]:
E2V_EMBEDDING_SIZE = 300

In [16]:
emoji_df = pd.read_csv('Vocabulary/emoji_scores.csv', header=None, names=['emoji', 'score'])
emoji_list = emoji_df['emoji'].tolist()[:200]
emoji_list

['❤',
 '✅',
 '😷',
 '👉',
 '✨',
 '💉',
 '✔',
 '➡',
 '🙏',
 '📍',
 '🦠',
 '👇',
 '📞',
 '⚠',
 '🌟',
 '😂',
 '💙',
 '😊',
 '🌐',
 '📸',
 '📲',
 '💪',
 '💕',
 '▶',
 '🔗',
 '🔥',
 '☎',
 '™',
 '👍',
 '🎉',
 '😍',
 '⭐',
 '🚨',
 '💯',
 '🙌',
 '⚕',
 '‼',
 '💚',
 '🥰',
 '🙏🏻',
 '❗',
 '💛',
 '♥',
 '🤣',
 '💜',
 '♀',
 '🩺',
 '🔹',
 '📱',
 '📢',
 '👏',
 '💻',
 '📧',
 '📷',
 '☀',
 '🏥',
 '🤝',
 '▪',
 '✈',
 '👩',
 '🤍',
 '😉',
 '👀',
 '😁',
 '🌍',
 '📊',
 '🤩',
 '🤔',
 '😎',
 '🤗',
 '🔸',
 '🥳',
 '🧡',
 '🌈',
 '💫',
 '👉🏻',
 '📣',
 '😃',
 '🌎',
 '⬇',
 '🧪',
 '✌',
 '🗣',
 '💖',
 '🙂',
 '♂',
 '📰',
 '💥',
 '♦',
 '👆',
 '👨',
 '®',
 '📌',
 '🙏🏼',
 '☺',
 '🤑',
 '💬',
 '💰',
 '😇',
 '🖤',
 '🌺',
 '📚',
 '🤒',
 '⚡',
 '🔔',
 '🏡',
 '💡',
 '🌿',
 '😅',
 '🤧',
 '🌸',
 '💗',
 '🙏🏽',
 '👈',
 '🔬',
 '🏠',
 '😭',
 '😀',
 '🎥',
 '⬆',
 '🔷',
 '👥',
 '🎃',
 '💊',
 '😘',
 '☝🏻',
 '👣',
 '☝',
 '🌱',
 '🔴',
 '😄',
 '🎄',
 '➖',
 '💌',
 '⏰',
 '📈',
 '🎨',
 '😱',
 '🚀',
 '🏆',
 '🔁',
 '🍁',
 '☑',
 '📩',
 '💔',
 '👇🏻',
 '😢',
 '🗓',
 '🌞',
 '👋',
 '🙌🏻',
 '🎇',
 '🩸',
 '🦟',
 '💃',
 '👹',
 '👌',
 '💧',
 '🚫',
 '❌',
 '🙌🏽',
 '😔',
 '🛡',
 '🐾',
 '🙄',
 '

In [17]:
new_emoji_tokens = [emoji for emoji in emoji_list if emoji not in tokenizer.get_vocab()]
tokenizer.add_tokens(new_emoji_tokens)

model.resize_token_embeddings(len(tokenizer))

emoji_projection_layer = torch.nn.Linear(E2V_EMBEDDING_SIZE, model.config.hidden_size)

for emoji in new_emoji_tokens:
    token_index = tokenizer.convert_tokens_to_ids(emoji)
    
    if emoji in e2v:
        e2v_vector = torch.tensor(e2v[emoji], dtype=torch.float32)
        projected_vector = emoji_projection_layer(e2v_vector.unsqueeze(0)).squeeze(0)
        embedding_layer.weight.data[token_index] = projected_vector
        print(f"Initialized '{emoji}' with projected e2v weights.")
    else:
        embedding_layer.weight.data[token_index] = torch.randn(model.config.hidden_size)
        print(f"Initialized '{emoji}' with random weights.")

Initialized '❤' with random weights.
Initialized '✅' with projected e2v weights.
Initialized '😷' with projected e2v weights.
Initialized '👉' with projected e2v weights.
Initialized '✨' with projected e2v weights.
Initialized '💉' with projected e2v weights.
Initialized '✔' with random weights.
Initialized '➡' with random weights.
Initialized '🙏' with projected e2v weights.
Initialized '📍' with projected e2v weights.
Initialized '🦠' with random weights.
Initialized '👇' with projected e2v weights.
Initialized '📞' with projected e2v weights.
Initialized '⚠' with random weights.
Initialized '🌟' with projected e2v weights.
Initialized '😂' with projected e2v weights.
Initialized '💙' with projected e2v weights.
Initialized '😊' with projected e2v weights.
Initialized '🌐' with projected e2v weights.
Initialized '📸' with projected e2v weights.
Initialized '📲' with projected e2v weights.
Initialized '💪' with projected e2v weights.
Initialized '💕' with projected e2v weights.
Initialized '▶' with ra

In [18]:
for emoji in emoji_list:
    token_index = tokenizer.convert_tokens_to_ids(emoji)
    if token_index != tokenizer.unk_token_id:
        print(f"'{emoji}' successfully added to the tokenizer with index {token_index}.")
    else:
        print(f"'{emoji}' was not added to the tokenizer.")


'❤' successfully added to the tokenizer with index 30653.
'✅' successfully added to the tokenizer with index 30654.
'😷' successfully added to the tokenizer with index 30655.
'👉' successfully added to the tokenizer with index 30656.
'✨' successfully added to the tokenizer with index 30657.
'💉' successfully added to the tokenizer with index 30658.
'✔' successfully added to the tokenizer with index 30659.
'➡' successfully added to the tokenizer with index 30660.
'🙏' successfully added to the tokenizer with index 30661.
'📍' successfully added to the tokenizer with index 30662.
'🦠' successfully added to the tokenizer with index 30663.
'👇' successfully added to the tokenizer with index 30664.
'📞' successfully added to the tokenizer with index 30665.
'⚠' successfully added to the tokenizer with index 30666.
'🌟' successfully added to the tokenizer with index 30667.
'😂' successfully added to the tokenizer with index 30668.
'💙' successfully added to the tokenizer with index 30669.
'😊' successful

In [19]:
for emoji in emoji_list:
    token_index = tokenizer.convert_tokens_to_ids(emoji)
    embedding_vector = embedding_layer.weight.data[token_index]
    print(f"Embedding for '{emoji}': {embedding_vector[:5]}")

Embedding for '❤': tensor([-1.9318,  1.3915,  0.2706,  0.6012, -1.2936])
Embedding for '✅': tensor([ 0.0045, -0.0347,  0.0493, -0.0238, -0.0845])
Embedding for '😷': tensor([ 0.0257, -0.0700,  0.0110,  0.0366, -0.0962])
Embedding for '👉': tensor([ 0.0352, -0.0107,  0.0786, -0.0365, -0.0146])
Embedding for '✨': tensor([ 0.0493, -0.0641,  0.0667,  0.0196, -0.0253])
Embedding for '💉': tensor([ 0.0545, -0.0260,  0.0555,  0.0024, -0.0931])
Embedding for '✔': tensor([ 0.5333, -1.5993,  1.1099, -0.5623,  0.2066])
Embedding for '➡': tensor([ 1.9998,  0.5251, -0.9919,  0.8025, -0.9525])
Embedding for '🙏': tensor([ 0.0116, -0.0413,  0.0525, -0.0464, -0.1010])
Embedding for '📍': tensor([ 0.0313, -0.0385,  0.0822, -0.0313, -0.0587])
Embedding for '🦠': tensor([-0.1094,  0.3395, -0.6321, -1.3158, -0.6772])
Embedding for '👇': tensor([ 0.0509, -0.0328,  0.0651, -0.0666,  0.0037])
Embedding for '📞': tensor([ 0.0410, -0.0845,  0.0979, -0.0110, -0.0151])
Embedding for '⚠': tensor([-0.8477, -0.6070, -0.028

# Save model and tokenizer

In [20]:
model_save_path = "./expanded_vocab_bert"

model.save_pretrained(model_save_path)

tokenizer.save_pretrained(model_save_path)

print(f"Model and tokenizer saved to {model_save_path}")


Model and tokenizer saved to ./expanded_vocab_bert


# Load the model and test

In [21]:
newTokenizer = BertTokenizer.from_pretrained(model_save_path)
newModel = BertModel.from_pretrained(model_save_path)

In [22]:
test_slang = "gratz"
test_emoji = "✅"

slang_id = tokenizer.convert_tokens_to_ids(test_slang)
emoji_id = tokenizer.convert_tokens_to_ids(test_emoji)

print(f"Token ID for slang '{test_slang}': {slang_id}")
print(f"Token ID for emoji '{test_emoji}': {emoji_id}")

if slang_id == tokenizer.unk_token_id:
    print(f"Slang '{test_slang}' is not in the vocabulary.")
else:
    print(f"Slang '{test_slang}' is in the vocabulary.")

if emoji_id == tokenizer.unk_token_id:
    print(f"Emoji '{test_emoji}' is not in the vocabulary.")
else:
    print(f"Emoji '{test_emoji}' is in the vocabulary.")


Token ID for slang 'gratz': 30651
Token ID for emoji '✅': 30654
Slang 'gratz' is in the vocabulary.
Emoji '✅' is in the vocabulary.


In [23]:
embedding_layer = newModel.embeddings.word_embeddings

if slang_id != tokenizer.unk_token_id:
    slang_embedding = embedding_layer.weight.data[slang_id]
    print(f"Embedding for slang '{test_slang}': {slang_embedding}")

if emoji_id != tokenizer.unk_token_id:
    emoji_embedding = embedding_layer.weight.data[emoji_id]
    print(f"Embedding for emoji '{test_emoji}': {emoji_embedding}")

Embedding for slang 'gratz': tensor([ 3.0528e-02,  2.0413e-01, -1.8689e-01, -9.5478e-02, -1.3664e-01,
        -3.6737e-02, -9.7823e-02, -3.1699e-02,  1.4831e-01,  1.4878e-01,
         3.5912e-01,  1.9500e-01, -2.4533e-01, -1.5895e-01, -5.7964e-02,
         1.8909e-01,  1.9251e-01,  1.6659e-01,  1.7418e-01,  1.5912e-01,
         7.7693e-02,  3.1886e-02, -3.2958e-01,  2.7532e-01, -2.1682e-01,
        -4.7799e-01, -8.9799e-02, -1.4757e-02,  2.9868e-01, -1.9759e-01,
         3.6164e-02,  2.4430e-01, -1.3531e-02,  3.5472e-01, -2.6867e-01,
         2.9537e-02, -1.0596e-01,  1.7264e-01, -1.2466e-01,  6.0011e-03,
         6.5716e-02,  1.0858e-01, -4.1726e-01,  3.5357e-01, -2.0372e-01,
        -2.3939e-01, -7.8870e-02,  1.0573e-01,  4.7009e-01,  2.1477e-04,
         2.4618e-01,  3.2215e-01,  9.5944e-02, -2.0055e-01, -9.6634e-02,
         1.9768e-01, -4.2679e-02,  1.2662e-01,  1.7296e-01,  3.2968e-01,
         1.2838e-01, -2.0800e-01,  2.9591e-02,  1.6847e-01, -1.1582e-01,
        -2.6298e-03, -