In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize

In [2]:
# Load the combined dataset; the preview below shows its `text`,
# `polarity` and `source` columns.
corpus_path = 'Datasets/Cleaned with tokens/combined_dataset.csv'
data = pd.read_csv(corpus_path)
data

Unnamed: 0,text,polarity,source
0,"- awww, that's a bummer. you shoulda got davi...",0,Twitter
1,is upset that he can't update his facebook by ...,0,Twitter
2,i dived many times for the ball. managed to sa...,0,Twitter
3,my whole body feels itchy and like its on fire,0,Twitter
4,"no, it's not behaving at all. i'm mad. why am ...",0,Twitter
...,...,...,...
2196816,"best viet hoagies you'll find in the area, or ...",1,Yelp
2196817,"if you need medical testing of any kind, i wou...",1,Yelp
2196818,this place is a dream. honestly my favorite in...,1,Yelp
2196819,great place to have your dog groom. my one dog...,1,Yelp


# Slang dataset and TF-IDF extraction

https://huggingface.co/datasets/MLBtrio/genz-slang-dataset/viewer/default/train

In [None]:
from datasets import load_dataset

# Fetch the slang dataset by its Hugging Face identifier.
slang_dataset = load_dataset("MLBtrio/genz-slang-dataset")

# Keep only entries from the train split whose 'Slang' value is a single,
# purely alphabetic word (no spaces, digits or punctuation).
slang_words = []
for entry in slang_dataset['train']:
    term = entry['Slang']
    if ' ' in term or not term.isalpha():
        continue
    slang_words.append(term)

# Lower-cased lookup set; slang_tokenizer filters tokens against it.
slang_words_set = {term.lower() for term in slang_words}
slang_words_set

In [6]:
def slang_tokenizer(text):
    """Tokenize ``text`` and keep only the tokens that are known slang words.

    The text is lower-cased and split with NLTK's ``word_tokenize`` using
    ``language='english'`` and ``preserve_line=True``.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the alphabetic tokens that appear in ``slang_words_set``,
        in their original order and with repeats kept.
    """
    lowered = text.lower()
    kept = []
    for token in word_tokenize(lowered, language='english', preserve_line=True):
        if not token.isalpha():
            continue
        if token in slang_words_set:
            kept.append(token)
    return kept

In [7]:
# slang_tokenizer replaces scikit-learn's regex tokenization, so the default
# token_pattern would never be used. Passing token_pattern=None states that
# explicitly and avoids the "token_pattern will not be used" UserWarning that
# scikit-learn emits when both a tokenizer and a pattern are set.
vectorizer = TfidfVectorizer(tokenizer=slang_tokenizer, token_pattern=None, lowercase=True)

In [None]:
tfidf_matrix = vectorizer.fit_transform(data['text'])
feature_names = vectorizer.get_feature_names_out()

# Keep the per-document frame sparse-backed. Calling toarray() would allocate
# one float for every (document, term) pair, which does not fit in memory for
# a corpus of this size even though almost every entry is zero.
tfidf_df = pd.DataFrame.sparse.from_spmatrix(
    tfidf_matrix, index=data.index, columns=feature_names
)

# Sum each term's TF-IDF weight over all documents directly on the sparse
# matrix, then rank the terms from highest to lowest total.
slang_scores = pd.Series(
    np.asarray(tfidf_matrix.sum(axis=0)).ravel(),
    index=feature_names,
).sort_values(ascending=False)

# header=False: this file is read back with header=None and explicit column
# names, so a header line would be parsed as a data row.
slang_scores.to_csv('Vocabulary/slang_scores.csv', header=False)

In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Testing tokenizer detection of slang words
words = ["u", "im", "wat", 'lmao', 'lol', 'brb', 'omg', 'wtf', 'smh', 'idk', 'tbh', 'sry']
for word in words:
    token_id = tokenizer.convert_tokens_to_ids(word)
    # Compare against the tokenizer's own [UNK] id rather than a hard-coded
    # 100, so the check stays correct if a different checkpoint is loaded.
    if token_id == tokenizer.unk_token_id:
        print(f"'{word}' is NOT in the BERT vocabulary (mapped to [UNK]).")
    else:
        print(f"'{word}' is in the BERT vocabulary with ID {token_id}.")

# Check the slang score distribution

In [None]:
# Reload the saved slang scores. The file is read without a header row and
# its two columns are labelled here.
slang_score_columns = ['Slang', 'Score']
slang_score_distribution = pd.read_csv(
    'Vocabulary/slang_scores.csv',
    header=None,
    names=slang_score_columns,
)
slang_score_distribution

Unnamed: 0,Slang,Score
0,was,217339.246501
1,so,165827.902975
2,at,165670.348117
3,are,156590.391256
4,we,143283.229747
...,...,...
927,nrn,0.642039
928,iaaa,0.632667
929,mirl,0.600004
930,fbm,0.572891


In [10]:
# Report how many slang terms have a summed score above each threshold.
for threshold in (0, 50, 100, 1000, 10000):
    above_threshold = slang_score_distribution[slang_score_distribution['Score'] > threshold]
    print(f"Slang scores above {threshold}: {above_threshold['Score'].count()}")


Slang scores above 0: 932
Slang scores above 50: 428
Slang scores above 100: 353
Slang scores above 1000: 108
Slang scores above 10000: 15


# Check example usages of the slang terms

In [3]:
# Load the slang sample file. Its first line is treated as a header and
# replaced by the column labels given here.
slang_sample_columns = ['Slang', 'TF-IDF']
slang_samples = pd.read_csv(
    'Vocabulary/slang_scores_sample.csv',
    header=0,
    names=slang_sample_columns,
)
slang_samples

Unnamed: 0,Slang,TF-IDF
0,im,41467.735488
1,oh,35708.223269
2,u,34531.078095
3,bio,13142.782625
4,wow,11133.654649
...,...,...
224,pov,46.223900
225,bcos,42.637456
226,gtfo,41.713035
227,gratz,34.687383


In [4]:
def find_sentences_with_slang(df, slang, max_samples=3):
    """Collect the first few texts in ``df["text"]`` that contain ``slang``.

    Each text is lower-cased and tokenized with NLTK's ``word_tokenize``; a
    text matches when ``slang`` is one of the resulting tokens.

    Args:
        df: Object whose ``"text"`` entry is an iterable of texts
            (e.g. a DataFrame with a ``text`` column).
        slang: Token to look for; compared against lower-cased tokens.
        max_samples: Maximum number of matching texts to return. Values
            ``<= 0`` return an empty list.

    Returns:
        A list of at most ``max_samples`` original (not lower-cased) texts,
        in the order they appear in ``df["text"]``.
    """
    matching_sentences = []
    # Check the limit up front: testing it only after an append would let
    # max_samples=0 return one sentence.
    if max_samples <= 0:
        return matching_sentences
    for sentence in df["text"]:
        # Missing values (e.g. NaN from read_csv) have no .lower(); skip them.
        if not isinstance(sentence, str):
            continue
        if slang in word_tokenize(sentence.lower()):
            matching_sentences.append(sentence)
            if len(matching_sentences) >= max_samples:
                break
    return matching_sentences

results = {}

column_list = slang_samples['Slang'].tolist()

# NOTE: every call scans `data` from its first row until enough matches are
# found, so rare terms re-tokenize a large part of the dataset.
for slang in column_list:
    sentences = find_sentences_with_slang(data, slang, max_samples=3)
    if sentences:
        results[slang] = sentences

# Always build results_df, with explicit columns, so it is defined even when
# nothing matched. Previously the empty case left it unassigned and the
# to_csv call below raised NameError.
results_df = pd.DataFrame(
    [
        {"Slang": slang, "Sentence": sentence}
        for slang, sentences in results.items()
        for sentence in sentences
    ],
    columns=["Slang", "Sentence"],
)

if results_df.empty:
    print("No slang terms found in the dataset.")

# With no matches this writes a header-only CSV.
results_df.to_csv('slang_sample_examples.csv', index=False)

In [18]:
# Lift the display limits on cell width and row count for DataFrame output.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = None

In [None]:
# Display the collected (Slang, Sentence) example rows.
results_df

# Emoji TF-IDF extraction

In [27]:
import emoji


def extract_emojis(text):
    """Return the emojis found in ``text`` joined by single spaces.

    Uses ``emoji.emoji_list`` and keeps each match's ``'emoji'`` entry; the
    result is an empty string when no emoji is found.
    """
    found = emoji.emoji_list(text)
    return " ".join(match['emoji'] for match in found)


# Work on a copy so `data` itself does not gain the extra column.
emoji_df = data.copy()
emoji_df["emojis"] = data["text"].apply(extract_emojis)
# Rows without any emoji get the literal placeholder "none".
emoji_df["emojis"] = emoji_df["emojis"].replace("", "none")

In [28]:
# Preview the frame with the added `emojis` column ("none" where no emoji was found).
emoji_df

Unnamed: 0,text,polarity,source,emojis
0,"- awww, that's a bummer. you shoulda got davi...",0,Twitter,none
1,is upset that he can't update his facebook by ...,0,Twitter,none
2,i dived many times for the ball. managed to sa...,0,Twitter,none
3,my whole body feels itchy and like its on fire,0,Twitter,none
4,"no, it's not behaving at all. i'm mad. why am ...",0,Twitter,none
...,...,...,...,...
2196816,"best viet hoagies you'll find in the area, or ...",1,Yelp,none
2196817,"if you need medical testing of any kind, i wou...",1,Yelp,none
2196818,this place is a dream. honestly my favorite in...,1,Yelp,none
2196819,great place to have your dog groom. my one dog...,1,Yelp,none


In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    lowercase=False,
    token_pattern=r"[^\s]+"  # Match any non-whitespace characters (e.g., emojis)
)

tfidf_matrix = vectorizer.fit_transform(emoji_df["emojis"])
feature_names = vectorizer.get_feature_names_out()

# Keep the per-document frame sparse-backed. Calling toarray() would allocate
# one float for every (document, emoji) pair, which does not fit in memory for
# a corpus of this size even though almost every entry is zero.
tfidf_df = pd.DataFrame.sparse.from_spmatrix(
    tfidf_matrix, index=emoji_df.index, columns=feature_names
)

# Sum each token's TF-IDF weight over all documents on the sparse matrix.
# The "none" placeholder given to emoji-free rows of emoji_df["emojis"] also
# matches the token pattern, so it is dropped here rather than being ranked
# as if it were an emoji.
emoji_scores = (
    pd.Series(np.asarray(tfidf_matrix.sum(axis=0)).ravel(), index=feature_names)
    .drop("none", errors="ignore")
    .sort_values(ascending=False)
)

# header=False: this file is read back with header=None and explicit column
# names, so a header line would be parsed as a data row.
emoji_scores.to_csv("Vocabulary/emoji_scores.csv", header=False)

In [6]:
# Reload the saved emoji scores. The file is read without a header row and
# its two columns are labelled here.
emoji_score_columns = ['Emoji', 'Score']
emoji_scores_distribution = pd.read_csv(
    'Vocabulary/emoji_scores.csv',
    header=None,
    names=emoji_score_columns,
)
emoji_scores_distribution

Unnamed: 0,Emoji,Score
0,❤,4378.094343
1,✅,4073.393400
2,😷,2764.420312
3,👉,2686.821333
4,✨,2266.170048
...,...,...
1725,🏄🏾,0.109644
1726,💱,0.095398
1727,👧🏿,0.093676
1728,🏊🏾,0.038994


In [4]:
# Report how many emojis have a summed score above each threshold.
for threshold in (0, 50, 100, 1000, 10000):
    above_threshold = emoji_scores_distribution[emoji_scores_distribution['Score'] > threshold]
    print(f"Emoji scores above {threshold}: {above_threshold['Score'].count()}")

Emoji scores above 0: 1730
Emoji scores above 50: 348
Emoji scores above 100: 228
Emoji scores above 1000: 16
Emoji scores above 10000: 0
