In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed

# Read the emoticon training split (path is relative to the working directory).
emoticon_df = pd.read_csv('datasets/train/train_emoticon.csv')

# Emojis singled out for filtering: the helper functions defined below strip
# them from (or zero them out of) an encoded sequence.
relevant_emojis = ['😛', '🛐', '🚼', '🙼','🙯','😣','😑']

# Character-level tokenizer: every distinct character of the input strings is
# assigned its own integer id in `tokenizer.word_index`.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(emoticon_df['input_emoticon'])

# Tokenizer id of each listed emoji, in list order. Emojis the tokenizer never
# saw are skipped instead of raising.
emoji_indices = dict(
    (symbol, tokenizer.word_index[symbol])
    for symbol in relevant_emojis
    if symbol in tokenizer.word_index
)
print(emoji_indices)

# Encode every input string as a list of tokenizer ids.
sequences = tokenizer.texts_to_sequences(emoticon_df['input_emoticon'])
def replace_unwanted_emojis(sequence):
    """Replace every token id whose character is in ``relevant_emojis`` with 0.

    Args:
        sequence: Iterable of integer token ids produced by ``tokenizer``.

    Returns:
        A new list of the same length in which ids that map to a listed emoji
        are replaced by 0 and every other id is passed through unchanged.
        Ids missing from ``tokenizer.index_word`` (such as 0, which the
        tokenizer never assigns and which this function itself emits) are
        passed through rather than raising ``KeyError``, so the function can
        be applied to padded or already-processed sequences.
    """
    # Build the lookup set once per call instead of scanning the list per id.
    listed = set(relevant_emojis)
    id_to_char = tokenizer.index_word
    # .get() yields None for unknown ids; None is never in `listed`.
    return [0 if id_to_char.get(token_id) in listed else token_id for token_id in sequence]
def remove_relevant_emojis(sequence):
    """Drop every token id whose character is listed in ``relevant_emojis``.

    Args:
        sequence: Iterable of integer token ids produced by ``tokenizer``.

    Returns:
        A new list holding the remaining ids in their original order. Ids
        missing from ``tokenizer.index_word`` (such as the 0 used for padding)
        are kept instead of raising ``KeyError``.
    """
    # Build the lookup set once per call instead of scanning the list per id.
    listed = set(relevant_emojis)
    id_to_char = tokenizer.index_word
    # .get() yields None for unknown ids; None is never in `listed`.
    return [token_id for token_id in sequence if id_to_char.get(token_id) not in listed]
# Show the first encoded sample before any filtering.
print(sequences[0])

# Strip the ids of the listed emojis from every encoded sample.
modified_sequences = list(map(remove_relevant_emojis, sequences))

# Pad at the end, up to the longest filtered sample, so the model receives a
# rectangular array.
max_len = max(map(len, modified_sequences))
X_padded = pad_sequences(modified_sequences, padding='post', maxlen=max_len)

# The same first sample after filtering and padding.
print(X_padded[0])

# Map the raw labels to consecutive integer codes.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(emoticon_df['label'])

# Hold out 20% of the samples for evaluation; the fixed random_state keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_padded,
    y_encoded,
    test_size=0.2,
    random_state=42,
)
print(X_train[0])
# Seed the Python, NumPy and TensorFlow RNGs so that weight initialisation,
# dropout and batch shuffling -- and therefore the reported accuracy -- are
# repeatable, matching the fixed random_state used for the train/test split.
SEED = 42
set_random_seed(SEED)

# Compact CNN-LSTM binary classifier over the filtered emoji id sequences.
model = Sequential()

# Embedding table sized for every tokenizer id plus the reserved id 0.
# NOTE: no pre-trained weights are loaded here, so trainable=False freezes the
# table at its random initialisation; the freeze only keeps these weights out
# of the trainable-parameter count.
# `input_length` is deliberately not passed: it is deprecated in Keras 3 and
# the sequence length is inferred from the data on the first fit() call.
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=50, trainable=False))

# Width-1 convolution: projects each position's embedding to 64 channels
# without mixing neighbouring positions.
model.add(Conv1D(filters=64, kernel_size=1, activation='relu'))

# Single LSTM that returns only its final output (return_sequences=False).
model.add(LSTM(16, return_sequences=False))

# Classification head: one hidden layer with light dropout, then a sigmoid
# unit producing the probability of the positive class.
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=20, batch_size=128, verbose=1)

# Threshold the predicted probabilities at 0.5 to obtain hard 0/1 labels.
y_pred = (model.predict(X_test) > 0.5).astype(int)

# Evaluate on the held-out split; the summary is printed after fit() because
# the model is only built once it has seen data.
accuracy = accuracy_score(y_test, y_pred)
print(f"Neural Network Test Accuracy: {accuracy * 100:.2f}%")
model.summary()


2024-10-06 10:53:25.492962: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-06 10:53:25.506378: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-06 10:53:25.510289: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-06 10:53:25.520776: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


{'😛': 4, '🛐': 5, '🚼': 6, '🙼': 7, '🙯': 3, '😣': 2, '😑': 1}
[4, 5, 32, 1, 2, 127, 3, 6, 18, 7, 1, 3, 2]
[ 32 127  18]
[ 84 206  76]
Epoch 1/20


I0000 00:00:1728192206.795683   71169 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-10-06 10:53:26.799957: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2343] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.5031 - loss: 0.6931
Epoch 2/20
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5560 - loss: 0.6919
Epoch 3/20
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6559 - loss: 0.6773
Epoch 4/20
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7428 - loss: 0.5515
Epoch 5/20
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7968 - loss: 0.4362
Epoch 6/20
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8581 - loss: 0.3307
Epoch 7/20
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9032 - loss: 0.2359
Epoch 8/20
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9362 - loss: 0.1655
Epoch 9/20
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1