In [17]:
# Suppress TensorFlow logs for cleaner output & ignore warnings
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import warnings
warnings.filterwarnings('ignore')

# Import all the libraries needed
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Bidirectional, Dropout
from sklearn.metrics import classification_report, confusion_matrix

1. LSTM implementation

In [None]:
# Download stopwords
nltk.download('stopwords', download_dir='/kaggle/working/nltk_data')
nltk.data.path.append('/kaggle/working/nltk_data')
english_stops = set(stopwords.words('english'))

# Load dataset
train_data = pd.read_csv('phm_train.csv')  # Update path
test_data = pd.read_csv('phm_test.csv')    # Update path

# Preprocessing function
def load_dataset(data):
    x_data = data['tweet']
    y_data = data['label']

    # Remove HTML tags and non-alphabet characters
    x_data = x_data.replace({'<.*?>': ''}, regex=True)
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex=True)

    # Remove stopwords and convert to lowercase
    x_data = x_data.apply(lambda review: [w.lower() for w in review.split() if w.lower() not in english_stops])

    # Encode sentiment labels
    # y_data = y_data.replace('positive', 1)
    # y_data = y_data.replace('negative', 0).infer_objects(copy=False)

    return x_data, y_data

x_train, y_train = load_dataset(train_data)
x_test, y_test = load_dataset(test_data)

# Convert Tokens to Text Strings
x_train = x_train.apply(lambda x: ' '.join(x))
x_test = x_test.apply(lambda x: ' '.join(x))

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(x_train)

x_train_seq = tokenizer.texts_to_sequences(x_train)
x_test_seq = tokenizer.texts_to_sequences(x_test)

x_train_pad = pad_sequences(x_train_seq, maxlen=250, padding='post', truncating='post')
x_test_pad = pad_sequences(x_test_seq, maxlen=250, padding='post', truncating='post')

# Define the LSTM model
LSTM_model = Sequential()
LSTM_model.add(Embedding(input_dim=10000, output_dim=64))
LSTM_model.add(LSTM(64, dropout=0.3, recurrent_dropout=0.3))
LSTM_model.add(Dense(1, activation='sigmoid'))

# Compile LSTM_model
LSTM_model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.0001), metrics=['accuracy'])

# Train the LSTM_model
LSTM_model.fit(x_train_pad, y_train, epochs=10, batch_size=16, validation_data=(x_test_pad, y_test))

# Evaluate on test data
loss, accuracy = LSTM_model.evaluate(x_test_pad, y_test)
print(f'Test Accuracy: {accuracy:.4f}')

[nltk_data] Downloading package stopwords to
[nltk_data]     /kaggle/working/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m223s[0m 346ms/step - accuracy: 0.6888 - loss: 0.6295 - val_accuracy: 0.7097 - val_loss: 0.6026
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m257s[0m 338ms/step - accuracy: 0.7045 - loss: 0.6095 - val_accuracy: 0.7097 - val_loss: 0.6026
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m215s[0m 344ms/step - accuracy: 0.7174 - loss: 0.5972 - val_accuracy: 0.7097 - val_loss: 0.6028
Epoch 4/10
[1m 52/625[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m3:07[0m 327ms/step - accuracy: 0.7399 - loss: 0.5810

In [None]:
# Predict on test data
y_pred_probs = LSTM_model.predict(x_test_pad)
y_pred = (y_pred_probs > 0.5).astype(int)  # Convert probabilities to binary labels
print()

# Classification Report
print("Classification Report:")
print(classification_report(y_test, y_pred))
print()

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=["Negative", "Positive"], yticklabels=["Negative", "Positive"])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Fit histories
history_lstm = LSTM_model

2. Bi-LSTM Implementation

In [None]:
# Download stopwords
nltk.download('stopwords', download_dir='/kaggle/working/nltk_data')
nltk.data.path.append('/kaggle/working/nltk_data')
english_stops = set(stopwords.words('english'))

# Load dataset
train_data = pd.read_csv('phm_train.csv')  # Update path
test_data = pd.read_csv('phm_test.csv')    # Update path

# Preprocessing function
def load_dataset(data):
    x_data = data['tweet']
    y_data = data['label']

    # Remove HTML tags and non-alphabet characters
    x_data = x_data.replace({'<.*?>': ''}, regex=True)
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex=True)

    # Remove stopwords and convert to lowercase
    x_data = x_data.apply(lambda review: [w.lower() for w in review.split() if w.lower() not in english_stops])

    # Encode sentiment labels
    # y_data = y_data.replace('positive', 1)
    # y_data = y_data.replace('negative', 0).infer_objects(copy=False)

    return x_data, y_data

x_train, y_train = load_dataset(train_data)
x_test, y_test = load_dataset(test_data)

# Convert Tokens to Text Strings
x_train = x_train.apply(lambda x: ' '.join(x))
x_test = x_test.apply(lambda x: ' '.join(x))

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(x_train)

x_train_seq = tokenizer.texts_to_sequences(x_train)
x_test_seq = tokenizer.texts_to_sequences(x_test)

x_train_pad = pad_sequences(x_train_seq, maxlen=200, padding='post', truncating='post')
x_test_pad = pad_sequences(x_test_seq, maxlen=200, padding='post', truncating='post')

# Define the Bi-LSTM model
BILSTM_model = Sequential()
BILSTM_model.add(Embedding(input_dim=10000, output_dim=64))  # Increased embedding size
BILSTM_model.add(Bidirectional(LSTM(64, return_sequences=True)))  # Return full sequence
BILSTM_model.add(Dropout(0.5))
BILSTM_model.add(Bidirectional(LSTM(32)))
BILSTM_model.add(Dense(1, activation='sigmoid'))

# Compile BILSTM_model
BILSTM_model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.0001), metrics=['accuracy'])

# Train the BILSTM_model
BILSTM_model.fit(x_train_pad, y_train, epochs=10, batch_size=128, validation_data=(x_test_pad, y_test))

# Evaluate on test data
loss, accuracy = BILSTM_model.evaluate(x_test_pad, y_test)
print(f'Test Accuracy: {accuracy:.4f}')

In [None]:
# Predict & confusion matrix
y_pred_prob = BILSTM_model.predict(x_test_pad)
y_pred = (y_pred_prob > 0.5).astype(int)
print()

# Classification report
print(classification_report(y_test, y_pred))
print()

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=["Negative", "Positive"], yticklabels=["Negative", "Positive"])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Fit histories
history_bilstm = BILSTM_model

3. Comparison

In [None]:
# Replace with your actual history variable names if different
def plot_accuracy(history_lstm, history_bilstm):
    plt.figure(figsize=(12, 6))

    # Plot LSTM accuracy
    plt.subplot(1, 2, 1)
    plt.plot(history_lstm.history['accuracy'], label='Train Accuracy')
    plt.plot(history_lstm.history['val_accuracy'], label='Validation Accuracy')
    plt.title('LSTM Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.grid(True)

    # Plot Bi-LSTM accuracy
    plt.subplot(1, 2, 2)
    plt.plot(history_bilstm.history['accuracy'], label='Train Accuracy')
    plt.plot(history_bilstm.history['val_accuracy'], label='Validation Accuracy')
    plt.title('Bi-LSTM Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.grid(True)

    plt.tight_layout()
    plt.show()