In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dense, Dropout,SimpleRNN,GRU,Bidirectional
from sklearn.metrics import accuracy_score

# Load the training and validation splits of the emoticon dataset.
emoticon_df = pd.read_csv('datasets/train/train_emoticon.csv')
valid_emoticon_df = pd.read_csv("datasets/valid/valid_emoticon.csv")

# Step 1: Identify the target emojis.
# These characters are filtered out of every token sequence before padding.
relevant_emojis = ['😛', '🛐', '🚼', '🙼','🙯','😣','😑']
# relevant_emojis = ['😛', '🛐']

# Character-level tokenizer: every character of `input_emoticon` is one token.
# The vocabulary is fitted on the training split only and then reused as-is
# for the validation split, so both share the same token ids.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(emoticon_df['input_emoticon'])

# NOTE: `sequences` must be assigned before it is printed; an earlier
# `print(sequences[0])` placed ahead of this assignment raised NameError on a
# fresh kernel and has been removed.
sequences = tokenizer.texts_to_sequences(emoticon_df['input_emoticon'])
val_sequences = tokenizer.texts_to_sequences(valid_emoticon_df['input_emoticon'])
def remove_relevant_emojis(sequence):
    """Return ``sequence`` without the token ids of the target emojis.

    Each token id is mapped back to its character through
    ``tokenizer.index_word``; ids whose character appears in the module-level
    ``relevant_emojis`` list are dropped. The remaining ids keep their
    original order.

    Raises:
        KeyError: if an id in ``sequence`` is not in ``tokenizer.index_word``.
    """
    kept_ids = []
    for token_id in sequence:
        if tokenizer.index_word[token_id] in relevant_emojis:
            continue
        kept_ids.append(token_id)
    return kept_ids

# Show the first tokenized training example before any filtering.
print(sequences[0])

# Strip the target emojis from every training and validation sequence.
modified_sequences = list(map(remove_relevant_emojis, sequences))
modified_val_sequences = list(map(remove_relevant_emojis, val_sequences))

# Padding the sequences: the target length is the longest filtered *training*
# sequence, and the validation split is forced to that same length.
max_len = max(map(len, modified_sequences))
X_padded, X_val_padded = (
    pad_sequences(seqs, maxlen=max_len, padding='post')
    for seqs in (modified_sequences, modified_val_sequences)
)

# Sanity check: first padded row of each split.
print(X_padded[0])
print(X_val_padded[0])

# Encode the class labels as integers.
# The encoder is fitted on the training labels only. Validation labels are
# mapped with the already-fitted classes via `transform`, so both splits are
# guaranteed to share one label -> integer mapping. Re-fitting on the
# validation labels would overwrite that mapping; `transform` instead raises
# ValueError if the validation split contains a label never seen in training.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(emoticon_df['label'])
y_val_encoded = label_encoder.transform(valid_emoticon_df['label'])

# Hold out 20% of the padded training data; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_padded,
    y_encoded,
    random_state=42,
    test_size=0.2,
)
# Sanity check: first row of the training portion.
print(X_train[0])

# Binary classifier: embedding -> pointwise convolution -> LSTM -> dense head.
model = Sequential()

# Embedding layer: one 50-dimensional vector per token id. The vocabulary size
# is len(word_index) + 1 because id 0 is used for padding.
# NOTE(review): no pre-trained weights are loaded here, so trainable=False
# freezes the *random* initialisation. Confirm this is intentional (e.g. to
# keep the trainable-parameter count low) rather than a leftover.
# The deprecated `input_length` argument is omitted; the sequence length is
# taken from the data the model is first called on.
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=50, trainable=False))

# Convolutional layer; kernel_size=1 transforms each position independently.
model.add(Conv1D(filters=64, kernel_size=1, activation='relu'))
# model.add(MaxPooling1D(pool_size=1))

# LSTM layer; return_sequences=False keeps only the final step's output.
model.add(LSTM(16, return_sequences=False))

# Dense layers with Dropout; the single sigmoid unit outputs a probability
# for the positive class.
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(1, activation='sigmoid'))

# Compile the model for binary classification.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model on the training portion of the split.
history = model.fit(
    X_train,
    y_train,
    batch_size=8,
    epochs=30,
    verbose=1,
)

# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions for the
# validation set.
val_probabilities = model.predict(X_val_padded)
y_pred = (val_probabilities > 0.5).astype(int)

# Evaluate the model. Note that the score is computed on the validation split
# (X_val_padded / y_val_encoded), although the message below says "Test".
accuracy = accuracy_score(y_val_encoded, y_pred)
print(f"Neural Network Test Accuracy: {accuracy * 100:.2f}%")
model.summary()



[4, 5, 32, 1, 2, 127, 3, 6, 18, 7, 1, 3, 2]
[ 32 127  18]
[ 44 194  75]
[ 84 206  76]
Epoch 1/30




[1m708/708[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 783us/step - accuracy: 0.5075 - loss: 0.6912
Epoch 2/30
[1m708/708[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 782us/step - accuracy: 0.8064 - loss: 0.4072
Epoch 3/30
[1m708/708[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 796us/step - accuracy: 0.9194 - loss: 0.1875
Epoch 4/30
[1m708/708[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 821us/step - accuracy: 0.9388 - loss: 0.1391
Epoch 5/30
[1m708/708[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 819us/step - accuracy: 0.9570 - loss: 0.1060
Epoch 6/30
[1m708/708[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 808us/step - accuracy: 0.9596 - loss: 0.0903
Epoch 7/30
[1m708/708[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 797us/step - accuracy: 0.9630 - loss: 0.0838
Epoch 8/30
[1m708/708[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 805us/step - accuracy: 0.9711 - loss: 0.0758
Epoch 9/30
[1m708/708[0m [32m━━━