In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [15]:
emote_df = pd.read_csv("../datasets/train/train_emoticon.csv")
emote_df

# 1. split the string into individual characters
# 2. run a correlation matrix on each category
# check number of distinct emotes

# Split the string into individual characters
emote_df['characters'] = emote_df['input_emoticon'].apply(list)

# Create new columns for each character
for i in range(len(emote_df['characters'][0])):
    emote_df[f'c_{i+1}'] = emote_df['characters'].apply(lambda x: x[i] if len(x) > i else '')
l1 = [f'c_{i+1}' for i in range(len(emote_df['characters'][0]))]
edf = emote_df[l1].apply(lambda x: [ord(i) for i in x])
edf.insert(13, 'label', emote_df['label'])
emote_df[['input_emoticon', 'label']].head(25)

Unnamed: 0,input_emoticon,label
0,😛🛐😻😑😣🙠🙯🚼😒🙼😑🙯😣,0
1,🛐😑😪😛🚼🙯😣🚅😑🙯😹😣🙼,0
2,😛🙯😑🚡😣🚼🛐🙲😣🙯🛑😑🙼,0
3,😛🚼🛐🙐😣🙯😑🙪😑🙼🛆😣🙯,1
4,🛐🚟🚼😛🙋😑😣🙯😹🙯😑😣🙼,1
5,😑😣🚧😛🚜🚼🙯🛐🙼😣😑🙕🙯,1
6,😣😑🙯🚼🛐🚥😬😛😣🚄😑🙼🙯,0
7,🚡🚼😑🛐🚔🙯😛😣😑🙯🛓🙼😣,0
8,🛐😛🛜😑🚼😚😣🙯😣😑🙯🚠🙼,0
9,🙯😑🙷🛐🚼😣😛😍😿🙯🙼😑😣,1


In [16]:
emo = emote_df['input_emoticon'].to_list()

In [17]:
import demoji

# Ensure emoji data is updated
demoji.download_codes()

# List of emojis to convert
emoji_list = ["😀", "🔥", "❤️", "😂", "🚀"]

# Convert each emoji to text description
text_list = [demoji.replace_with_desc(emoji) for emoji in emoji_list]

# Printing the results
for emoji, text in zip(emoji_list, text_list):
    print(f"Emoji: {emoji} -> Text: {text}")

Emoji: 😀 -> Text: :grinning face:
Emoji: 🔥 -> Text: :fire:
Emoji: ❤️ -> Text: :red heart:
Emoji: 😂 -> Text: :face with tears of joy:
Emoji: 🚀 -> Text: :rocket:


  demoji.download_codes()


In [24]:
for i in range(13) :
    emote_df[f'c_{i+1}_text'] = emote_df[f'c_{i+1}'].apply(lambda x: demoji.replace_with_desc(x)[1:-1] if x != '' else x)

In [35]:
emote_df.iloc[90]

input_emoticon                              🙯🚚😑😛🛌😣🚼🛐🙩🙯😣🙼😑
label                                                   0
characters        [🙯, 🚚, 😑, 😛, 🛌, 😣, 🚼, 🛐, 🙩, 🙯, 😣, 🙼, 😑]
c_1                                                     🙯
c_2                                                     🚚
c_3                                                     😑
c_4                                                     😛
c_5                                                     🛌
c_6                                                     😣
c_7                                                     🚼
c_8                                                     🛐
c_9                                                     🙩
c_10                                                    🙯
c_11                                                    😣
c_12                                                    🙼
c_13                                                    😑
c_1_text                                                 
c_2_text      

In [36]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load pre-trained model and tokenizer (e.g., BERT)
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')


  from .autonotebook import tqdm as notebook_tqdm


In [56]:
def text2embed(text) :
    # Tokenize the input text and convert to input IDs and attention masks
    inputs = tokenizer(text, return_tensors='pt')

    # Generate embeddings (hidden states)
    with torch.no_grad():
        outputs = model(**inputs)

    # The embeddings are in the last hidden state
    # Shape: (batch_size, sequence_length, hidden_size)
    embeddings = outputs.last_hidden_state

    # To get a single embedding for the sentence, we typically pool the embeddings:
    # Mean pooling across the sequence length axis (axis=1)
    sentence_embedding = torch.mean(embeddings, dim=1)

    # print(sentence_embedding)
    return sentence_embedding.numpy().reshape(-1)

In [57]:
embed = text2embed('who is gay')

In [63]:
X = pd.DataFrame(columns=[f'c_{i+1}_embed' for i in range(13)])

In [64]:
X.head()

Unnamed: 0,c_1_embed,c_2_embed,c_3_embed,c_4_embed,c_5_embed,c_6_embed,c_7_embed,c_8_embed,c_9_embed,c_10_embed,c_11_embed,c_12_embed,c_13_embed


In [62]:
for i in range(13) :
    X[f'c_{i+1}_embed'] = emote_df[f'c_{i+1}_text'].apply(lambda x: text2embed(x) if x != '' else np.zeros(768))

KeyboardInterrupt: 

In [None]:
y = emote_df['label'].to_numpy()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## XGBOOST

In [None]:
import numpy as np
from xgboost import XGBClassifier

X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)

In [None]:
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report

In [None]:
def get_classification_report(y_true, y_pred):
    return classification_report(y_true, y_pred, target_names=['Class 0', 'Class 1'])

In [None]:
y_pred = xgb_model.predict(X_test)

report = get_classification_report(y_test, y_pred)
print(report)