<a href="https://colab.research.google.com/github/AbeerProg/RRDS/blob/main/GloVe_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from imblearn.over_sampling import SMOTE

# Load your dataset
df = pd.read_excel('Final_dataset.xlsx')

# Preprocess text and numerical data
texts = df['review_text']  # Replace 'review_text' with your actual text column
numerical_features = df.iloc[:, 1:11].values  # 7 continuous + 3 boolean features
labels = df['Label'].values  # Target variable (0 or 1)

# Tokenize and pad text sequences
max_len = 100  # You can adjust this based on your dataset
vocab_size = 10000  # Limit vocabulary size for RAM optimization

tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

# Split the data into training and test sets
X_train_text, X_test_text, X_train_num, X_test_num, y_train, y_test = train_test_split(
    padded_sequences, numerical_features, labels, test_size=0.2, random_state=42)

# Normalize numerical features
scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train_num)
X_test_num = scaler.transform(X_test_num)


In [5]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate, Dropout

# Build the LSTM model with GloVe embeddings
def build_lstm_model(vocab_size, max_len, num_numerical_features, embedding_dim, embedding_matrix):
    # Text input branch
    text_input = Input(shape=(max_len,), name='text_input')
    embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim,
                                weights=[embedding_matrix], input_length=max_len, trainable=False)(text_input)
    lstm_layer = LSTM(128)(embedding_layer)

    # Numerical input branch
    num_input = Input(shape=(num_numerical_features,), name='num_input')

    # Combine the text and numerical branches
    combined = Concatenate()([lstm_layer, num_input])

    # Dense layers for final classification
    dense = Dense(64, activation='relu')(combined)
    dropout = Dropout(0.5)(dense)
    output = Dense(1, activation='sigmoid')(dropout)

    model = Model(inputs=[text_input, num_input], outputs=output)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model

# Build and compile the LSTM model
lstm_model = build_lstm_model(vocab_size, max_len, X_train_num.shape[1], embedding_dim, embedding_matrix)

# Train the model
lstm_history = lstm_model.fit([X_train_text, X_train_num], y_train, validation_split=0.2, epochs=10, batch_size=32)

# Evaluate the model on the test set
lstm_loss, lstm_accuracy = lstm_model.evaluate([X_test_text, X_test_num], y_test)
print(f"LSTM Test Accuracy: {lstm_accuracy:.4f}")


Epoch 1/10




[1m505/505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 69ms/step - accuracy: 0.7690 - loss: 0.4876 - val_accuracy: 0.8752 - val_loss: 0.3081
Epoch 2/10
[1m505/505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 69ms/step - accuracy: 0.8479 - loss: 0.3422 - val_accuracy: 0.8650 - val_loss: 0.3054
Epoch 3/10
[1m505/505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 68ms/step - accuracy: 0.8587 - loss: 0.3263 - val_accuracy: 0.8838 - val_loss: 0.2799
Epoch 4/10
[1m505/505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 68ms/step - accuracy: 0.8634 - loss: 0.3157 - val_accuracy: 0.8861 - val_loss: 0.2761
Epoch 5/10
[1m505/505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 68ms/step - accuracy: 0.8690 - loss: 0.2968 - val_accuracy: 0.8848 - val_loss: 0.2720
Epoch 6/10
[1m505/505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 67ms/step - accuracy: 0.8705 - loss: 0.3017 - val_accuracy: 0.8970 - val_loss: 0.2548
Epoch 7/10
[1m505/505[0m 

In [6]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

# Predict on the test set
y_pred_proba = lstm_model.predict([X_test_text, X_test_num])  # Predict probabilities
y_pred = (y_pred_proba > 0.5).astype(int)  # Convert probabilities to binary predictions

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_pred_proba)

# Print the metrics
print(f"LSTM Model Performance Metrics:")
print(f"Accuracy   : {accuracy:.4f}")
print(f"Precision  : {precision:.4f}")
print(f"Recall     : {recall:.4f}")
print(f"F1-Score   : {f1:.4f}")
print(f"AUC-ROC    : {auc_roc:.4f}")

# Detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


[1m135/135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 25ms/step
LSTM Model Performance Metrics:
Accuracy   : 0.8985
Precision  : 0.9057
Recall     : 0.8465
F1-Score   : 0.8751
AUC-ROC    : 0.9625

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.94      0.91      2491
           1       0.91      0.85      0.88      1805

    accuracy                           0.90      4296
   macro avg       0.90      0.89      0.89      4296
weighted avg       0.90      0.90      0.90      4296

