In [1]:
from collections import Counter
import bz2
import tensorflow as tf
import re
from sklearn.model_selection import train_test_split
import numpy as np


In [2]:

def assign_labels_and_comments(file, subset_size):
    labels = []
    comments = []
    
    for i, line in enumerate(bz2.BZ2File(file)):
        if i >= subset_size:
            break
        x = line.decode("utf-8")
        labels.append(int(x[9]) - 1)
        comments.append(x[10:].strip())
    return np.array(labels), comments

training_size = 3600000 // 3   # 1,200,000
testing_size = 400000 // 2      # 200,000

train_labels, train_comments = assign_labels_and_comments("/kaggle/input/amazonreviews/train.ft.txt.bz2", training_size )

test_labels, test_comments = assign_labels_and_comments("/kaggle/input/amazonreviews/test.ft.txt.bz2", testing_size )

In [3]:
print("Training size :", len(train_comments))
print("Tesing size :", len(test_comments))

print("First 5 training comments and labels:")
for i in range(5):
    print(f"Label: {train_labels[i]}, Comment: {train_comments[i]}")

Training size : 1200000
Tesing size : 200000
First 5 training comments and labels:
Label: 1, Comment: Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^
Label: 1, Comment: The best soundtrack ever to anything.: I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd write a review to disagree a bit. This in my opinino is Yasunori Mitsuda's ultimate masterpiece. The music is timeless and I'm been listening to it for years now and its beauty simply refuses to fade.The price tag on this is pretty staggering I must say, but if you are going to buy any cd for this much money, thi

In [4]:
not_alphanumeric = re.compile(r'[\W]')
not_ascii = re.compile(r'[^a-z0-1\s]')

In [5]:
def processed_comments(texts):
    processed_comments = []
    for text in texts:
        lower = text.lower()
        no_punctuation = not_alphanumeric.sub(r' ', lower)
        no_non_ascii = not_ascii.sub(r'', no_punctuation)
        processed_comments.append(no_non_ascii)
    return processed_comments
train_comments = processed_comments(train_comments)
test_comments = processed_comments(test_comments)

print("\nFirst 5 preprocessed training comments:")
for i in range(5):
    print(f"Comment: {train_comments[i]}")


First 5 preprocessed training comments:
Comment: stuning even for the non gamer  this sound track was beautiful  it paints the senery in your mind so well i would recomend it even to people who hate vid  game music  i have played the game chrono cross but out of all of the games i have ever played it has the best music  it backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras  it would impress anyone who cares to listen    
Comment: the best soundtrack ever to anything   i m reading a lot of reviews saying that this is the best  game soundtrack  and i figured that i d write a review to disagree a bit  this in my opinino is yasunori mitsuda s ultimate masterpiece  the music is timeless and i m been listening to it for years now and its beauty simply refuses to fade the price tag on this is pretty staggering i must say  but if you are going to buy any cd for this much money  this is the only one that i feel would be worth every penny 
Comme

In [6]:
train_data, val_data, train_labels, val_labels = train_test_split(train_comments, train_labels, random_state=42, test_size=0.2)


In [7]:
val_data, test_data, val_labels, test_labels = train_test_split(val_data, val_labels, random_state=42, test_size=0.5)

In [8]:
print("Train size: ",len(train_data))
print("Val size: ",len(val_data))


Train size:  960000
Val size:  120000


In [9]:
print("test size: ",len(test_data))


test size:  120000


In [10]:
import collections
import numpy as np

def build_vocab(texts, max_features):
    counter = collections.Counter(word for text in texts for word in text.split())
    vocab = {word: i+1 for i, (word, _) in enumerate(counter.most_common(max_features - 1))}
    vocab['<UNK>'] = max_features - 1
    return vocab

def text_to_sequence(texts, vocab):
    return [[vocab.get(word, vocab['<UNK>']) for word in text.split()] for text in texts]

def pad_sequences(sequences, maxlen):
    return np.array([seq[:maxlen] + [0] * (maxlen - len(seq)) for seq in sequences])

# Maximum features
maximum_features = 14000

# Build vocabulary
vocab = build_vocab(train_data, maximum_features)

# Tokenize text to sequences
train_comments_seq = text_to_sequence(train_data, vocab)
val_comments_seq = text_to_sequence(val_data, vocab)
test_comments_seq = text_to_sequence(test_data, vocab)

# Remove empty sequences and corresponding labels
def remove_empty(sequences, labels):
    return zip(*[(seq, label) for seq, label in zip(sequences, labels) if seq])

train_comments_seq, train_labels = remove_empty(train_comments_seq, train_labels)
val_comments_seq, val_labels = remove_empty(val_comments_seq, val_labels)
test_comments_seq, test_labels = remove_empty(test_comments_seq, test_labels)

# Convert from tuple to lists
train_comments_seq, train_labels = list(train_comments_seq), list(train_labels)
val_comments_seq, val_labels = list(val_comments_seq), list(val_labels)
test_comments_seq, test_labels = list(test_comments_seq), list(test_labels)

# Find maximum sequence length
maximum_length = max(map(len, train_comments_seq + val_comments_seq + test_comments_seq))

# Pad sequences
train_comments_pad = pad_sequences(train_comments_seq, maximum_length)
val_comments_pad = pad_sequences(val_comments_seq, maximum_length)
test_comments_pad = pad_sequences(test_comments_seq, maximum_length)


In [11]:
train_dataset = tf.data.Dataset.from_tensor_slices((train_comments_pad, train_labels)).batch(128)
val_dataset = tf.data.Dataset.from_tensor_slices((val_comments_pad, val_labels)).batch(128)
test_dataset = tf.data.Dataset.from_tensor_slices((test_comments_pad, test_labels)).batch(128)

In [12]:
train_labels = np.array(train_labels)
val_labels = np.array(val_labels)
test_labels = np.array(test_labels)

print(train_comments_pad.shape[0])
print(train_labels.shape[0])
print(val_comments_pad.shape[0])
print(val_labels.shape[0])
print(test_comments_pad.shape[0])
print(test_labels.shape[0])

959998
959998
120000
120000
120000
120000


In [13]:
from collections import Counter

print(Counter(train_labels))  # Check the distribution of training labels
print(Counter(val_labels))    # Check the distribution of validation labels


Counter({1: 484401, 0: 475597})
Counter({1: 60666, 0: 59334})


In [15]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=maximum_features, output_dim=128, input_length=maximum_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5), 
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', 
    metrics = [
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall')
        
]
)




In [16]:
model.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=[
                  'accuracy', 
                  tf.keras.metrics.Precision(name='precision'),
                  tf.keras.metrics.Recall(name='recall')
              ]
)

history = model.fit(train_comments_pad, train_labels, 
                    epochs=5, 
                    batch_size=128, 
                    validation_data=(val_comments_pad, val_labels), 
                    verbose=1)


Epoch 1/5
[1m7500/7500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m522s[0m 68ms/step - accuracy: 0.8981 - loss: 0.2468 - precision: 0.9000 - recall: 0.8972 - val_accuracy: 0.9391 - val_loss: 0.1577 - val_precision: 0.9289 - val_recall: 0.9524
Epoch 2/5
[1m7500/7500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m516s[0m 69ms/step - accuracy: 0.9480 - loss: 0.1427 - precision: 0.9482 - recall: 0.9487 - val_accuracy: 0.9447 - val_loss: 0.1467 - val_precision: 0.9603 - val_recall: 0.9289
Epoch 3/5
[1m7500/7500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m517s[0m 69ms/step - accuracy: 0.9587 - loss: 0.1165 - precision: 0.9589 - recall: 0.9592 - val_accuracy: 0.9469 - val_loss: 0.1424 - val_precision: 0.9577 - val_recall: 0.9364
Epoch 4/5
[1m7500/7500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m523s[0m 70ms/step - accuracy: 0.9675 - loss: 0.0952 - precision: 0.9677 - recall: 0.9678 - val_accuracy: 0.9461 - val_loss: 0.1498 - val_precision: 0.9541 - val_recall: 0.9386
Epoch 5/

In [17]:
model.evaluate(test_dataset)

[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 26ms/step - accuracy: 0.9470 - loss: 0.1561 - precision: 0.9501 - recall: 0.9441


[0.1596880853176117, 0.946150004863739, 0.9494239687919617, 0.9430623054504395]

In [18]:
y_pred= model.predict(test_comments_pad)


[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 13ms/step


In [19]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Assuming y_pred contains probabilities; convert to binary predictions
threshold = 0.5
y_pred_binary = (y_pred >= threshold).astype(int)

# Calculate metrics
y_true = test_labels  # Ensure this is binary as well
precision = precision_score(y_true, y_pred_binary)
recall = recall_score(y_true, y_pred_binary)
f1 = f1_score(y_true, y_pred_binary)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


Precision: 0.9494239438971448
Recall: 0.9430623279264935
F1 Score: 0.9462324435865008


# **LeakyReLU**

In [20]:
from tensorflow.keras.layers import LeakyReLU

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=maximum_features, output_dim=128, input_length=maximum_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64),
    LeakyReLU(alpha=0.1),  
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=[
                  'accuracy', 
                  tf.keras.metrics.Precision(name='precision'),
                  tf.keras.metrics.Recall(name='recall')
              ]
)

history = model.fit(train_comments_pad, train_labels, 
                    epochs=5, 
                    batch_size=128, 
                    validation_data=(val_comments_pad, val_labels), 
                    verbose=1)




Epoch 1/5
[1m7500/7500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m525s[0m 69ms/step - accuracy: 0.9036 - loss: 0.2364 - precision: 0.9053 - recall: 0.9047 - val_accuracy: 0.9417 - val_loss: 0.1534 - val_precision: 0.9456 - val_recall: 0.9387
Epoch 2/5
[1m7500/7500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m518s[0m 69ms/step - accuracy: 0.9480 - loss: 0.1418 - precision: 0.9491 - recall: 0.9480 - val_accuracy: 0.9463 - val_loss: 0.1430 - val_precision: 0.9521 - val_recall: 0.9411
Epoch 3/5
[1m7500/7500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m516s[0m 69ms/step - accuracy: 0.9589 - loss: 0.1154 - precision: 0.9586 - recall: 0.9599 - val_accuracy: 0.9468 - val_loss: 0.1437 - val_precision: 0.9558 - val_recall: 0.9381
Epoch 4/5
[1m7500/7500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m517s[0m 69ms/step - accuracy: 0.9674 - loss: 0.0940 - precision: 0.9674 - recall: 0.9679 - val_accuracy: 0.9464 - val_loss: 0.1487 - val_precision: 0.9451 - val_recall: 0.9491
Epoch 5/

In [21]:
model.evaluate(test_dataset)

[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 26ms/step - accuracy: 0.9456 - loss: 0.1722 - precision: 0.9416 - recall: 0.9507


[0.17651091516017914,
 0.944433331489563,
 0.9409307837486267,
 0.9489833116531372]

In [22]:
y_pred= model.predict(test_comments_pad)


[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 13ms/step


In [23]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Assuming y_pred contains probabilities; convert to binary predictions
threshold = 0.5
y_pred_binary = (y_pred >= threshold).astype(int)

# Calculate metrics
y_true = test_labels  # Ensure this is binary as well
precision = precision_score(y_true, y_pred_binary)
recall = recall_score(y_true, y_pred_binary)
f1 = f1_score(y_true, y_pred_binary)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


Precision: 0.9409307679657951
Recall: 0.9489833150893953
F1 Score: 0.9449398863786498


# **Stochastic Gradient Descent (SGD):**

In [24]:
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9)

model.compile(optimizer=optimizer, loss='binary_crossentropy', 
              metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
history = model.fit(train_comments_pad, train_labels, 
                    epochs=5, 
                    batch_size=128, 
                    validation_data=(val_comments_pad, val_labels), 
                    verbose=1)


Epoch 1/5
[1m7500/7500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m504s[0m 67ms/step - accuracy: 0.9839 - loss: 0.0512 - precision: 0.9833 - recall: 0.9849 - val_accuracy: 0.9435 - val_loss: 0.2100 - val_precision: 0.9468 - val_recall: 0.9412
Epoch 2/5
[1m7500/7500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m500s[0m 67ms/step - accuracy: 0.9855 - loss: 0.0472 - precision: 0.9851 - recall: 0.9863 - val_accuracy: 0.9434 - val_loss: 0.2140 - val_precision: 0.9412 - val_recall: 0.9473
Epoch 3/5
[1m7500/7500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m498s[0m 66ms/step - accuracy: 0.9864 - loss: 0.0443 - precision: 0.9861 - recall: 0.9871 - val_accuracy: 0.9429 - val_loss: 0.2216 - val_precision: 0.9410 - val_recall: 0.9464
Epoch 4/5
[1m7500/7500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m514s[0m 68ms/step - accuracy: 0.9871 - loss: 0.0428 - precision: 0.9865 - recall: 0.9879 - val_accuracy: 0.9424 - val_loss: 0.2281 - val_precision: 0.9423 - val_recall: 0.9438
Epoch 5/

In [25]:
model.evaluate(test_dataset)

[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 26ms/step - accuracy: 0.9447 - loss: 0.2337 - precision: 0.9455 - recall: 0.9445


[0.24067914485931396,
 0.9433833360671997,
 0.9441898465156555,
 0.9430623054504395]

In [26]:
y_pred= model.predict(test_comments_pad)


[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 13ms/step


In [27]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Assuming y_pred contains probabilities; convert to binary predictions
threshold = 0.5
y_pred_binary = (y_pred >= threshold).astype(int)

# Calculate metrics
y_true = test_labels  # Ensure this is binary as well
precision = precision_score(y_true, y_pred_binary)
recall = recall_score(y_true, y_pred_binary)
f1 = f1_score(y_true, y_pred_binary)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


Precision: 0.9441898309587858
Recall: 0.9430623279264935
F1 Score: 0.9436257426399813


 # **tanh**

In [28]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=maximum_features, output_dim=128, input_length=maximum_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='tanh'),  # Tanh activation function
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9)

model.compile(optimizer=optimizer, loss='binary_crossentropy', 
              metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
history = model.fit(train_comments_pad, train_labels, 
                    epochs=5, 
                    batch_size=128, 
                    validation_data=(val_comments_pad, val_labels), 
                    verbose=1)




Epoch 1/5
[1m7500/7500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m502s[0m 66ms/step - accuracy: 0.7152 - loss: 0.5003 - precision_1: 0.7108 - recall_1: 0.7458 - val_accuracy: 0.8868 - val_loss: 0.2762 - val_precision_1: 0.9405 - val_recall_1: 0.8284
Epoch 2/5
[1m7500/7500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m496s[0m 66ms/step - accuracy: 0.8871 - loss: 0.2775 - precision_1: 0.8894 - recall_1: 0.8865 - val_accuracy: 0.8826 - val_loss: 0.2988 - val_precision_1: 0.8691 - val_recall_1: 0.9040
Epoch 3/5
[1m7500/7500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m496s[0m 66ms/step - accuracy: 0.8805 - loss: 0.2888 - precision_1: 0.8825 - recall_1: 0.8802 - val_accuracy: 0.9173 - val_loss: 0.2101 - val_precision_1: 0.9176 - val_recall_1: 0.9189
Epoch 4/5
[1m7500/7500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m496s[0m 66ms/step - accuracy: 0.9175 - loss: 0.2104 - precision_1: 0.9205 - recall_1: 0.9157 - val_accuracy: 0.9244 - val_loss: 0.1921 - val_precision_1: 0.9271

In [29]:
model.evaluate(test_dataset)

[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 26ms/step - accuracy: 0.9305 - loss: 0.1796 - precision_1: 0.9311 - recall_1: 0.9306


[0.18245814740657806,
 0.9297666549682617,
 0.9306662678718567,
 0.9294623136520386]

In [30]:
y_pred= model.predict(test_comments_pad)


[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 13ms/step


In [31]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Assuming y_pred contains probabilities; convert to binary predictions
threshold = 0.5
y_pred_binary = (y_pred >= threshold).astype(int)

# Calculate metrics
y_true = test_labels  # Ensure this is binary as well
precision = precision_score(y_true, y_pred_binary)
recall = recall_score(y_true, y_pred_binary)
f1 = f1_score(y_true, y_pred_binary)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


Precision: 0.9306662681015012
Recall: 0.9294623013898564
F1 Score: 0.9300638951124388


# use another learning rate + cnn-lstm

In [14]:
import tensorflow as tf
from tensorflow.keras.layers import LeakyReLU

# Define the model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=maximum_features, output_dim=128, input_length=maximum_length),
    
    tf.keras.layers.LSTM(128, return_sequences=True),  # return_sequences=True so that Conv1D can follow
    
    tf.keras.layers.Conv1D(128, 5, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.Conv1D(64, 5, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    
    tf.keras.layers.GlobalMaxPooling1D(),
    
    tf.keras.layers.Dense(64),
    LeakyReLU(alpha=0.1),
    tf.keras.layers.Dropout(0.5),
    
    tf.keras.layers.Dense(1, activation='sigmoid')  
])

custom_learning_rate = 0.0005  

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=custom_learning_rate),
              loss='binary_crossentropy',
              metrics=[
                  'accuracy',
                  tf.keras.metrics.Precision(name='precision'),
                  tf.keras.metrics.Recall(name='recall')
              ])

# Train the model
history = model.fit(train_comments_pad, train_labels,
                    epochs=5,
                    batch_size=128,
                    validation_data=(val_comments_pad, val_labels),
                    verbose=1)




Epoch 1/5
[1m7500/7500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m226s[0m 29ms/step - accuracy: 0.8920 - loss: 0.2597 - precision: 0.8932 - recall: 0.8926 - val_accuracy: 0.9390 - val_loss: 0.1595 - val_precision: 0.9446 - val_recall: 0.9342
Epoch 2/5
[1m7500/7500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m258s[0m 29ms/step - accuracy: 0.9464 - loss: 0.1468 - precision: 0.9466 - recall: 0.9472 - val_accuracy: 0.9447 - val_loss: 0.1466 - val_precision: 0.9489 - val_recall: 0.9413
Epoch 3/5
[1m7500/7500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m218s[0m 29ms/step - accuracy: 0.9559 - loss: 0.1240 - precision: 0.9557 - recall: 0.9567 - val_accuracy: 0.9439 - val_loss: 0.1465 - val_precision: 0.9353 - val_recall: 0.9550
Epoch 4/5
[1m7500/7500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m218s[0m 29ms/step - accuracy: 0.9636 - loss: 0.1037 - precision: 0.9633 - recall: 0.9646 - val_accuracy: 0.9424 - val_loss: 0.1555 - val_precision: 0.9293 - val_recall: 0.9590
Epoch 5/

In [15]:
model.evaluate(test_dataset)

[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10ms/step - accuracy: 0.9455 - loss: 0.1574 - precision: 0.9427 - recall: 0.9491


[0.16093911230564117,
 0.9443583488464355,
 0.9416484236717224,
 0.9480047821998596]

In [16]:
y_pred= model.predict(test_comments_pad)


[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 4ms/step


In [17]:
from sklearn.metrics import precision_score, recall_score, f1_score

threshold = 0.5
y_pred_binary = (y_pred >= threshold).astype(int)

# Calculate metrics
y_true = test_labels  # Ensure this is binary as well
precision = precision_score(y_true, y_pred_binary)
recall = recall_score(y_true, y_pred_binary)
f1 = f1_score(y_true, y_pred_binary)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


Precision: 0.9416484077692295
Recall: 0.9480047765946861
F1 Score: 0.9448159014835323




### 1. **Bidirectional LSTM with LeakyReLU**
   - **Strengths**: The LeakyReLU allows for better gradient flow than standard ReLU, reducing the likelihood of vanishing gradients. Using bidirectional LSTMs helps capture both past and future dependencies in sequence data.
   - **Performance**: In your training, this model had good precision and recall (F1 Score around 94.49%). It's effective when sequence dependencies are important, and the dropout layer helps prevent overfitting.
   - **Use Case**: This model is particularly strong when understanding context in sequential data is critical, such as in natural language processing tasks (e.g., text classification, sentiment analysis).

### 2. **Bidirectional LSTM with Tanh**
   - **Strengths**: Tanh is often used in LSTM gates and helps model the signal in both directions (positive and negative). This can capture more complex patterns in the data, especially when the input features can have both positive and negative impacts.
   - **Performance**: With a similar F1 score (around 94.36%), this model performed well, but its architecture is more common in traditional LSTM setups. However, tanh can suffer from vanishing gradients when compared to LeakyReLU, which gives LeakyReLU an edge.

### 3. **SGD Optimizer with LSTM**
   - **Strengths**: Stochastic Gradient Descent (SGD) is a slower, more steady optimizer compared to Adam, and often works better with a larger learning rate and momentum in cases where Adam might overfit or overshoot. 
   - **Performance**: In your case, while SGD led to high accuracy (~98% during training), the generalization to the validation set slightly dropped compared to Adam, indicating that Adam might be a better choice for this specific task due to faster convergence and better generalization on unseen data.
   - **Use Case**: SGD is often more reliable when you have very large datasets or require fine-tuned learning over a long period. But for faster, more adaptable training like this, Adam could be preferable.

### 4. **LeakyReLU vs. Tanh**
   - **LeakyReLU**: Prevents dying neurons, helping in cases where ReLU might not pass enough gradients for optimization. It's often the preferred choice over tanh in modern deep learning because it allows gradients to flow through even negative parts of the function.
   - **Tanh**: More traditional in RNN architectures but tends to saturate and can suffer from vanishing gradient problems when the sequence is long. Tanh may not perform as well with deep networks.

### **Conclusion: Best Model**
Based on  results, **the Bidirectional LSTM with LeakyReLU and Adam optimizer** appears to be the best model, offering a good balance of gradient flow (thanks to LeakyReLU) and fast, stable convergence (due to Adam). This model has consistently provided high precision, recall, and F1 scores, and it generalizes well to validation data. For most tasks involving sequence data (like text classification or sequential prediction), this combination is likely to outperform the others. 

