In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize

# Slang dataset

https://huggingface.co/datasets/MLBtrio/genz-slang-dataset/viewer/default/train

In [None]:
from datasets import load_dataset

slang_dataset = load_dataset("MLBtrio/genz-slang-dataset")
slang_words = [entry['Slang'] for entry in slang_dataset['train'] if ' ' not in entry['Slang'] and entry['Slang'].isalpha()]
slang_words_set = set(word.lower() for word in slang_words)
slang_words_set

In [5]:
data = pd.read_csv('Datasets/Cleaned with tokens/combined_dataset.csv')
data

Unnamed: 0,text,polarity,source
0,"- awww, that's a bummer. you shoulda got davi...",0,Twitter
1,is upset that he can't update his facebook by ...,0,Twitter
2,i dived many times for the ball. managed to sa...,0,Twitter
3,my whole body feels itchy and like its on fire,0,Twitter
4,"no, it's not behaving at all. i'm mad. why am ...",0,Twitter
...,...,...,...
2196816,"best viet hoagies you'll find in the area, or ...",1,Yelp
2196817,"if you need medical testing of any kind, i wou...",1,Yelp
2196818,this place is a dream. honestly my favorite in...,1,Yelp
2196819,great place to have your dog groom. my one dog...,1,Yelp


In [6]:
def slang_tokenizer(text):
    tokens = word_tokenize(text.lower(), language='english', preserve_line=True)
    return [word for word in tokens if word.isalpha() and word in slang_words_set]

In [7]:
vectorizer = TfidfVectorizer(tokenizer=slang_tokenizer, lowercase=True)

In [None]:
tfidf_matrix = vectorizer.fit_transform(data['text'])

# Create a DataFrame to display results
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
tfidf_df.index = data.index

# Aggregate TF-IDF scores for each slang term
slang_scores = tfidf_df.sum(axis=0).sort_values(ascending=False)
slang_scores.to_csv('slang_scores.csv')

In [None]:
from transformers import BertTokenizer

# Test the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

words = ["u", "im", "wat", 'lmao', 'lol', 'brb', 'omg', 'wtf', 'smh', 'idk', 'tbh', 'sry']
for word in words:
    token_id = tokenizer.convert_tokens_to_ids(word)
    if token_id == 100:  # ID 100 corresponds to the [UNK] token
        print(f"'{word}' is NOT in the BERT vocabulary (mapped to [UNK]).")
    else:
        print(f"'{word}' is in the BERT vocabulary with ID {token_id}.")

# Check for example usage of the slang

# Inserting slang into BERT

In [65]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

TORCH_SEED = 42
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
torch.manual_seed(TORCH_SEED)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<torch._C.Generator at 0x1f4322c2d90>

In [38]:
def load_glove_embeddings(file_path):
    embeddings_index = {}
    
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = vector
    
    print(f"Loaded {len(embeddings_index)} word vectors from GloVe.")
    return embeddings_index

file_path = 'Embeddings/glove.twitter.27B/glove.twitter.27B.200d.txt'
glove_embeddings = load_glove_embeddings(file_path)
GLOVE_EMBEDDING_SIZE = 200

Loaded 1193514 word vectors from GloVe.


In [55]:
slang_words = ["lol", "lmao", "omg", "wtf"]

In [66]:
embedding_layer = model.bert.embeddings.word_embeddings

projection_layer = torch.nn.Linear(GLOVE_EMBEDDING_SIZE, model.config.hidden_size)

new_tokens = [token for token in slang_words if token not in tokenizer.get_vocab()]
tokenizer.add_tokens(new_tokens)

model.resize_token_embeddings(len(tokenizer))

embedding_layer = model.bert.embeddings.word_embeddings

for slang in new_tokens:
    token_index = tokenizer.convert_tokens_to_ids(slang)
    
    if slang in glove_embeddings:
        glove_vector = torch.tensor(glove_embeddings[slang], dtype=torch.float32)
        projected_vector = projection_layer(glove_vector.unsqueeze(0)).squeeze(0)
        embedding_layer.weight.data[token_index] = projected_vector
        print(f"Initialized '{slang}' with projected GloVe weights.")
    else:
        embedding_layer.weight.data[token_index] = torch.randn(model.config.hidden_size)
        print(f"Initialized '{slang}' with random weights.")


Initialized 'lol' with projected GloVe weights.
Initialized 'lmao' with projected GloVe weights.
Initialized 'omg' with projected GloVe weights.
Initialized 'wtf' with projected GloVe weights.


In [57]:
for slang in slang_words:
    token_index = tokenizer.convert_tokens_to_ids(slang)
    if token_index != tokenizer.unk_token_id:
        print(f"'{slang}' successfully added to the tokenizer with index {token_index}.")
    else:
        print(f"'{slang}' was not added to the tokenizer.")


'lol' successfully added to the tokenizer with index 30522.
'lmao' successfully added to the tokenizer with index 30523.
'omg' successfully added to the tokenizer with index 30524.
'wtf' successfully added to the tokenizer with index 30525.


In [58]:
for slang in slang_words:
    token_index = tokenizer.convert_tokens_to_ids(slang)
    if token_index != tokenizer.unk_token_id:
        embedding_vector = embedding_layer.weight.data[token_index]
        print(f"Embedding for '{slang}': {embedding_vector[:5]}")

Embedding for 'lol': tensor([-0.0323, -0.4869, -0.0764,  0.5014, -0.4434])
Embedding for 'lmao': tensor([-0.1084, -0.4460, -0.1522,  0.3065, -0.4368])
Embedding for 'omg': tensor([ 0.0654, -0.4869,  0.0401,  0.2163, -0.0247])
Embedding for 'wtf': tensor([ 0.0321, -0.6249, -0.0311,  0.2451, -0.1961])


In [59]:
for slang in slang_words:
    token_index = tokenizer.convert_tokens_to_ids(slang)
    if slang in glove_embeddings and token_index != tokenizer.unk_token_id:
        glove_vector = torch.tensor(glove_embeddings[slang], dtype=torch.float32)
        projected_vector = projection_layer(glove_vector.unsqueeze(0)).squeeze(0)
        bert_embedding = embedding_layer.weight.data[token_index]
        is_close = torch.allclose(bert_embedding, projected_vector, atol=1e-3)
        print(f"'{slang}': Initialized with GloVe? {'Yes' if is_close else 'No'}")

for slang in slang_words:
    token_index = tokenizer.convert_tokens_to_ids(slang)
    if slang not in glove_embeddings and token_index != tokenizer.unk_token_id:
        embedding_vector = embedding_layer.weight.data[token_index]
        print(f"'{slang}' was randomly initialized with values: {embedding_vector[:5]}")

'lol': Initialized with GloVe? Yes
'lmao': Initialized with GloVe? Yes
'omg': Initialized with GloVe? Yes
'wtf': Initialized with GloVe? Yes


# Inserting emoji in BERT

In [44]:
from gensim.models import keyedvectors

e2v = keyedvectors.load_word2vec_format('Embeddings/emoji2vec.bin', binary=True)

In [45]:
print(f"Dimension of e2v embedding:", e2v[0].shape[0])

Dimension of e2v embedding: 300


In [46]:
E2V_EMBEDDING_SIZE = 300

In [60]:
emoji_list = ["😀", "😂", "❤️", "🔥", "👍", "😭", "🙏", "🥺", "😍", "😅"]

In [67]:
new_emoji_tokens = [emoji for emoji in emoji_list if emoji not in tokenizer.get_vocab()]
tokenizer.add_tokens(new_emoji_tokens)

model.resize_token_embeddings(len(tokenizer))

embedding_layer = model.bert.embeddings.word_embeddings

emoji_projection_layer = torch.nn.Linear(E2V_EMBEDDING_SIZE, model.config.hidden_size)

for emoji in new_emoji_tokens:
    token_index = tokenizer.convert_tokens_to_ids(emoji)
    
    if emoji in e2v:
        e2v_vector = torch.tensor(e2v[emoji], dtype=torch.float32)
        projected_vector = emoji_projection_layer(e2v_vector.unsqueeze(0)).squeeze(0)
        embedding_layer.weight.data[token_index] = projected_vector
        print(f"Initialized '{emoji}' with projected e2v weights.")
    else:
        embedding_layer.weight.data[token_index] = torch.randn(model.config.hidden_size)
        print(f"Initialized '{emoji}' with random weights.")

Initialized '😀' with projected e2v weights.
Initialized '😂' with projected e2v weights.
Initialized '❤️' with projected e2v weights.
Initialized '🔥' with projected e2v weights.
Initialized '👍' with projected e2v weights.
Initialized '😭' with projected e2v weights.
Initialized '🙏' with projected e2v weights.
Initialized '🥺' with random weights.
Initialized '😍' with projected e2v weights.
Initialized '😅' with projected e2v weights.


In [49]:
for emoji in emoji_list:
    token_index = tokenizer.convert_tokens_to_ids(emoji)
    if emoji in e2v.key_to_index:
        print(f"'{emoji}' initialized with Emoji2Vec weights.")
    else:
        print(f"'{emoji}' initialized with random weights.")

'😀' initialized with Emoji2Vec weights.
'😂' initialized with Emoji2Vec weights.
'❤️' initialized with Emoji2Vec weights.
'🔥' initialized with Emoji2Vec weights.
'👍' initialized with Emoji2Vec weights.
'😭' initialized with Emoji2Vec weights.
'🙏' initialized with Emoji2Vec weights.
'🥺' initialized with random weights.
'😍' initialized with Emoji2Vec weights.
'😅' initialized with Emoji2Vec weights.


In [68]:
for emoji in emoji_list:
    token_index = tokenizer.convert_tokens_to_ids(emoji)
    embedding_vector = embedding_layer.weight.data[token_index]
    print(f"Embedding for '{emoji}': {embedding_vector[:5]}")

Embedding for '😀': tensor([ 0.0116, -0.0169,  0.0395, -0.0076,  0.0237])
Embedding for '😂': tensor([ 0.0213,  0.0384,  0.0585, -0.0085, -0.0127])
Embedding for '❤️': tensor([ 0.0173,  0.0480,  0.0604, -0.0098,  0.0586])
Embedding for '🔥': tensor([ 0.0571,  0.0239,  0.0397, -0.0350,  0.0097])
Embedding for '👍': tensor([ 0.0179,  0.0478,  0.0260, -0.0198,  0.0233])
Embedding for '😭': tensor([ 0.0044,  0.0139,  0.0420, -0.0062, -0.0175])
Embedding for '🙏': tensor([-0.0031,  0.0013,  0.0230, -0.0215, -0.0086])
Embedding for '🥺': tensor([-0.0629,  1.5145,  0.4056,  0.2759,  1.2953])
Embedding for '😍': tensor([ 0.0656,  0.0093,  0.0686, -0.0363,  0.0171])
Embedding for '😅': tensor([-0.0297,  0.0415,  0.0760, -0.0487, -0.0274])
