In [1]:
import pandas as pd
import numpy as np
import sys
import os

In [2]:
# Make the modules under ../common importable (the later cells import
# `preprocess`, `models` and `evaluate` by bare name).
COMMON_DIR = os.path.abspath('../common')
if COMMON_DIR not in sys.path:  # re-running this cell must not stack duplicate entries
    sys.path.append(COMMON_DIR)

In [3]:
from preprocess import getdfs

In [8]:
# Load the 'emoticon' dataset. getdfs returns two objects, unpacked here as the
# training and validation splits (presumably pandas DataFrames; confirm in preprocess.getdfs).
train_df, val_df = getdfs('emoticon')

In [9]:
# Preview the first rows of the training split to check the column layout.
train_df.head()

Unnamed: 0,c_1,c_2,c_3,c_4,c_5,c_6,c_7,c_8,c_9,c_10,c_11,c_12,c_13,label
0,😛,🛐,😻,😑,😣,🙠,🙯,🚼,😒,🙼,😑,🙯,😣,0
1,🛐,😑,😪,😛,🚼,🙯,😣,🚅,😑,🙯,😹,😣,🙼,0
2,😛,🙯,😑,🚡,😣,🚼,🛐,🙲,😣,🙯,🛑,😑,🙼,0
3,😛,🚼,🛐,🙐,😣,🙯,😑,🙪,😑,🙼,🛆,😣,🙯,1
4,🛐,🚟,🚼,😛,🙋,😑,😣,🙯,😹,🙯,😑,😣,🙼,1


## Emoji to Vector

In [10]:
import gensim.models as gsm

# Load the emoji embeddings from the word2vec-format binary file 'emoji2vec.bin'
# (relative path, so it is resolved against the current working directory).
# `e2v` is later used for membership tests (`emoji in e2v`) and lookups (`e2v[emoji]`).
e2v = gsm.KeyedVectors.load_word2vec_format('emoji2vec.bin', binary=True)

In [11]:
# Two separate empty frames with columns c_1..c_13, one per split.
# A chained assignment (`X_train = X_val = ...`) would bind both names to the
# very same DataFrame object, so filling one would silently fill the other.
emoji_columns = [f'c_{i+1}' for i in range(13)]
X_train = pd.DataFrame(columns=emoji_columns)
X_val = pd.DataFrame(columns=emoji_columns)

In [12]:
def embed_df(df : pd.DataFrame, embedding_dim : int = 300, num_columns: int = 13, vectors=None) -> np.ndarray:
    """Turn the emoji columns of ``df`` into one flat feature row per sample.

    For every row, the values in columns ``c_1`` .. ``c_{num_columns}`` are
    looked up in ``vectors`` and the resulting vectors are concatenated in
    column order. A value that ``vectors`` does not contain contributes a
    zero vector of length ``embedding_dim``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding at least the columns ``c_1`` .. ``c_{num_columns}``.
    embedding_dim : int
        Length of a single embedding vector; every looked-up vector must
        have exactly this length.
    num_columns : int
        Number of ``c_i`` columns to embed.
    vectors : mapping-like, optional
        Object supporting ``key in vectors`` and ``vectors[key]``. Defaults
        to the notebook-level ``e2v`` embeddings.

    Returns
    -------
    np.ndarray
        Array of shape ``(len(df), num_columns * embedding_dim)``.

    Raises
    ------
    KeyError
        If one of the expected ``c_i`` columns is missing from ``df``.
    ValueError
        If a looked-up vector does not have length ``embedding_dim``;
        without this check the feature columns of different rows could
        silently end up misaligned.
    """
    if vectors is None:
        vectors = e2v  # fall back to the embeddings loaded earlier in the notebook

    # np.hstack / np.vstack reject empty sequences, so an empty frame (or no
    # columns to embed) is answered directly with a correctly shaped array.
    if len(df) == 0 or num_columns <= 0:
        return np.zeros((len(df), max(num_columns, 0) * embedding_dim))

    columns = [f'c_{i+1}' for i in range(num_columns)]
    zero_vector = np.zeros(embedding_dim)
    cache = {}  # the same emoji recurs across rows; look each one up only once

    def _vector_for(emoji):
        """Return the embedding for ``emoji``, or the zero vector if unknown."""
        if emoji not in cache:
            if emoji in vectors:
                vec = np.asarray(vectors[emoji])
                if vec.shape != (embedding_dim,):
                    raise ValueError(
                        f"embedding for {emoji!r} has shape {vec.shape}, "
                        f"expected ({embedding_dim},)"
                    )
            else:
                vec = zero_vector
            cache[emoji] = vec
        return cache[emoji]

    # itertuples over just the emoji columns avoids building a Series per row.
    combined_embeddings = [
        np.hstack([_vector_for(emoji) for emoji in row])
        for row in df[columns].itertuples(index=False, name=None)
    ]

    return np.vstack(combined_embeddings)



In [13]:
# Embed both splits into flat feature matrices (training first, then validation).
x_train, x_valid = embed_df(train_df), embed_df(val_df)

In [14]:
# Targets: the `label` column of each split.
y_train, y_valid = train_df['label'], val_df['label']

In [15]:
import matplotlib.pyplot as plt
from models import predict_random_forest, predict_logistic_regression, predict_xgboost, predict_mlp, predict_svc
from evaluate import evaluate_predictions

In [None]:
# Run every candidate model on the emoji embeddings and hand its predictions
# to evaluate_predictions together with a dedicated subplot axis.
# Each predict_* helper is called as predict(x_train, y_train, x_valid);
# presumably it fits on the training split and returns predictions for
# x_valid (see the `models` module to confirm).
model_runs = [
    ('Random Forest', predict_random_forest),
    ('XGBoost', predict_xgboost),
    ('Logistic Regression', predict_logistic_regression),
    ('MLP', predict_mlp),
    ('SVC', predict_svc),
]

# One panel per model, 4 inches wide each (20x4 for the five models above),
# so the grid can never drift out of sync with the model list.
# squeeze=False keeps a 2-D axes grid even if only one model is listed.
fig, axes_grid = plt.subplots(
    1, len(model_runs), figsize=(4 * len(model_runs), 4), squeeze=False
)
axes = axes_grid[0]  # single row -> 1-D array of axes

predictions = {}
for ax, (model_name, predict) in zip(axes, model_runs):
    predictions[model_name] = predict(x_train, y_train, x_valid)
    evaluate_predictions(y_valid, predictions[model_name], model_name, ax=ax)

# Keep the per-model variable names so code that refers to them still works.
y_pred_rf = predictions['Random Forest']
y_pred_xgb = predictions['XGBoost']
y_pred_lr = predictions['Logistic Regression']
y_pred_mlp = predictions['MLP']
y_pred_svc = predictions['SVC']

plt.tight_layout()
plt.show()
